diff --git "a/sft/665K_Memory/Full_xmoe/checkpoint-6656/trainer_state.json" "b/sft/665K_Memory/Full_xmoe/checkpoint-6656/trainer_state.json" new file mode 100644--- /dev/null +++ "b/sft/665K_Memory/Full_xmoe/checkpoint-6656/trainer_state.json" @@ -0,0 +1,113185 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.8003366800937894, + "eval_steps": 500, + "global_step": 6656, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.03016264, + "auxiliary_loss_mlp": 0.02721907, + "balance_loss_clip": 2.50513363, + "balance_loss_mlp": 2.24049187, + "epoch": 0.00012024289063909097, + "flos": 24934766436360.0, + "grad_norm": 39.95509143258904, + "language_loss": 2.58289957, + "learning_rate": 0.0, + "loss": 1.89537072, + "num_input_tokens_seen": 20375, + "router_z_loss_clip": 5.11132812, + "router_z_loss_mlp": 4.81445312, + "step": 1, + "time_per_iteration": 20.919223308563232 + }, + { + "auxiliary_loss_clip": 0.02013491, + "auxiliary_loss_mlp": 0.01768048, + "balance_loss_clip": 1.67237806, + "balance_loss_mlp": 1.44341683, + "epoch": 0.00024048578127818193, + "flos": 30667271509440.0, + "grad_norm": 54.70626630446235, + "language_loss": 1.88788605, + "learning_rate": 5.021476677069823e-07, + "loss": 1.9257015, + "num_input_tokens_seen": 39035, + "router_z_loss_clip": 3.41015625, + "router_z_loss_mlp": 3.24609375, + "step": 2, + "time_per_iteration": 2.9709692001342773 + }, + { + "auxiliary_loss_clip": 0.02007318, + "auxiliary_loss_mlp": 0.01823467, + "balance_loss_clip": 1.66505527, + "balance_loss_mlp": 1.51104307, + "epoch": 0.0003607286719172729, + "flos": 19028254616640.0, + "grad_norm": 40.32383004470566, + "language_loss": 1.61467648, + "learning_rate": 7.958852231401551e-07, + "loss": 1.65298426, + "num_input_tokens_seen": 57600, + "router_z_loss_clip": 3.421875, + "router_z_loss_mlp": 3.12109375, + "step": 3, + "time_per_iteration": 2.917240858078003 + }, + { + "auxiliary_loss_clip": 0.02015862, + "auxiliary_loss_mlp": 0.01795723, + "balance_loss_clip": 1.67673182, + "balance_loss_mlp": 1.47013855, + "epoch": 0.00048097156255636386, + "flos": 19318256242560.0, + "grad_norm": 36.55198667809081, + "language_loss": 1.64376903, + "learning_rate": 1.0042953354139647e-06, + "loss": 1.68188477, + "num_input_tokens_seen": 76465, + "router_z_loss_clip": 3.3984375, + "router_z_loss_mlp": 3.25585938, + "step": 4, + "time_per_iteration": 3.1182327270507812 + }, + { + "auxiliary_loss_clip": 0.01996712, + "auxiliary_loss_mlp": 0.0181227, + "balance_loss_clip": 1.65542221, + "balance_loss_mlp": 1.49584103, + "epoch": 0.0006012144531954548, + "flos": 13993013531520.0, + "grad_norm": 58.72492907563633, + "language_loss": 1.94096088, + "learning_rate": 1.1659507774310057e-06, + "loss": 1.97905064, + "num_input_tokens_seen": 94350, + "router_z_loss_clip": 3.4140625, + "router_z_loss_mlp": 3.1640625, + "step": 5, + "time_per_iteration": 3.2286198139190674 + }, + { + "auxiliary_loss_clip": 0.02003315, + "auxiliary_loss_mlp": 0.01814967, + "balance_loss_clip": 1.66176546, + "balance_loss_mlp": 1.50063622, + "epoch": 0.0007214573438345458, + "flos": 23151238489440.0, + "grad_norm": 46.279782434257434, + "language_loss": 1.61122084, + "learning_rate": 1.2980328908471373e-06, + "loss": 1.64940381, + "num_input_tokens_seen": 114595, + "router_z_loss_clip": 3.421875, + "router_z_loss_mlp": 3.140625, + "step": 6, + "time_per_iteration": 3.255627393722534 + }, + { + "auxiliary_loss_clip": 0.02502186, + "auxiliary_loss_mlp": 0.01947306, + "balance_loss_clip": 2.16045308, + "balance_loss_mlp": 1.67875099, + "epoch": 0.0008417002344736367, + "flos": 67670125892640.0, + "grad_norm": 4.609227711156207, + "language_loss": 0.81528807, + "learning_rate": 1.4097067265369432e-06, + "loss": 0.85978293, + "num_input_tokens_seen": 179590, + "router_z_loss_clip": 3.4140625, + "router_z_loss_mlp": 2.6875, + "step": 7, + "time_per_iteration": 3.5490164756774902 + }, + { + "auxiliary_loss_clip": 0.0200666, + "auxiliary_loss_mlp": 0.01797744, + "balance_loss_clip": 1.66586781, + "balance_loss_mlp": 1.47921622, + "epoch": 0.0009619431251127277, + "flos": 21283144244640.0, + "grad_norm": 41.28903353829093, + "language_loss": 1.58680582, + "learning_rate": 1.506443003120947e-06, + "loss": 1.6248498, + "num_input_tokens_seen": 195090, + "router_z_loss_clip": 3.40820312, + "router_z_loss_mlp": 3.18554688, + "step": 8, + "time_per_iteration": 3.0158424377441406 + }, + { + "auxiliary_loss_clip": 0.02014212, + "auxiliary_loss_mlp": 0.01792749, + "balance_loss_clip": 1.67356277, + "balance_loss_mlp": 1.47384071, + "epoch": 0.0010821860157518186, + "flos": 23333598904320.0, + "grad_norm": 17.757453204926094, + "language_loss": 1.47819126, + "learning_rate": 1.5917704462803102e-06, + "loss": 1.51626086, + "num_input_tokens_seen": 211635, + "router_z_loss_clip": 3.40625, + "router_z_loss_mlp": 3.1875, + "step": 9, + "time_per_iteration": 3.0700607299804688 + }, + { + "auxiliary_loss_clip": 0.02019913, + "auxiliary_loss_mlp": 0.01841052, + "balance_loss_clip": 1.67992949, + "balance_loss_mlp": 1.52500391, + "epoch": 0.0012024289063909096, + "flos": 17011556383680.0, + "grad_norm": 13.289898433553955, + "language_loss": 1.53208685, + "learning_rate": 1.6680984451379884e-06, + "loss": 1.57069659, + "num_input_tokens_seen": 224705, + "router_z_loss_clip": 3.3984375, + "router_z_loss_mlp": 3.15820312, + "step": 10, + "time_per_iteration": 3.147230625152588 + }, + { + "auxiliary_loss_clip": 0.0201367, + "auxiliary_loss_mlp": 0.01847062, + "balance_loss_clip": 1.67349458, + "balance_loss_mlp": 1.53177714, + "epoch": 0.0013226717970300007, + "flos": 21290274738720.0, + "grad_norm": 13.355373076275715, + "language_loss": 1.32454264, + "learning_rate": 1.7371455188905097e-06, + "loss": 1.36314988, + "num_input_tokens_seen": 244635, + "router_z_loss_clip": 3.40234375, + "router_z_loss_mlp": 3.15429688, + "step": 11, + "time_per_iteration": 3.093308687210083 + }, + { + "auxiliary_loss_clip": 0.02006835, + "auxiliary_loss_mlp": 0.01779936, + "balance_loss_clip": 1.66536188, + "balance_loss_mlp": 1.45797515, + "epoch": 0.0014429146876690916, + "flos": 27240503863680.0, + "grad_norm": 10.803015472025615, + "language_loss": 1.25211954, + "learning_rate": 1.8001805585541196e-06, + "loss": 1.28998733, + "num_input_tokens_seen": 265765, + "router_z_loss_clip": 3.4140625, + "router_z_loss_mlp": 3.21875, + "step": 12, + "time_per_iteration": 3.1287357807159424 + }, + { + "auxiliary_loss_clip": 0.02009864, + "auxiliary_loss_mlp": 0.01749193, + "balance_loss_clip": 1.6686039, + "balance_loss_mlp": 1.41903102, + "epoch": 0.0015631575783081825, + "flos": 19064172948480.0, + "grad_norm": 6.879314970179564, + "language_loss": 1.29130459, + "learning_rate": 1.8581671739548328e-06, + "loss": 1.32889521, + "num_input_tokens_seen": 283500, + "router_z_loss_clip": 3.4140625, + "router_z_loss_mlp": 3.30078125, + "step": 13, + "time_per_iteration": 3.0814502239227295 + }, + { + "auxiliary_loss_clip": 0.02008753, + "auxiliary_loss_mlp": 0.01787537, + "balance_loss_clip": 1.66792011, + "balance_loss_mlp": 1.47244298, + "epoch": 0.0016834004689472734, + "flos": 48142997451360.0, + "grad_norm": 6.160973657127663, + "language_loss": 1.13423204, + "learning_rate": 1.9118543942439254e-06, + "loss": 1.17219496, + "num_input_tokens_seen": 305685, + "router_z_loss_clip": 3.40625, + "router_z_loss_mlp": 3.1484375, + "step": 14, + "time_per_iteration": 3.3178937435150146 + }, + { + "auxiliary_loss_clip": 0.02014139, + "auxiliary_loss_mlp": 0.01803662, + "balance_loss_clip": 1.67383575, + "balance_loss_mlp": 1.48093832, + "epoch": 0.0018036433595863645, + "flos": 34972919222400.0, + "grad_norm": 5.669659050489823, + "language_loss": 1.12897277, + "learning_rate": 1.961836000571161e-06, + "loss": 1.16715074, + "num_input_tokens_seen": 327340, + "router_z_loss_clip": 3.40429688, + "router_z_loss_mlp": 3.2265625, + "step": 15, + "time_per_iteration": 3.230517864227295 + }, + { + "auxiliary_loss_clip": 0.02484334, + "auxiliary_loss_mlp": 0.01821526, + "balance_loss_clip": 2.14237428, + "balance_loss_mlp": 1.52092743, + "epoch": 0.0019238862502254555, + "flos": 59773935281760.0, + "grad_norm": 3.7895697281449205, + "language_loss": 0.64669514, + "learning_rate": 2.0085906708279293e-06, + "loss": 0.68975377, + "num_input_tokens_seen": 382710, + "router_z_loss_clip": 3.421875, + "router_z_loss_mlp": 3.0, + "step": 16, + "time_per_iteration": 3.534778118133545 + }, + { + "auxiliary_loss_clip": 0.02014589, + "auxiliary_loss_mlp": 0.01812367, + "balance_loss_clip": 1.6731441, + "balance_loss_mlp": 1.49212337, + "epoch": 0.0020441291408645466, + "flos": 20816698996800.0, + "grad_norm": 6.095941354023679, + "language_loss": 1.15964818, + "learning_rate": 2.0525099325728135e-06, + "loss": 1.19791758, + "num_input_tokens_seen": 400890, + "router_z_loss_clip": 3.41796875, + "router_z_loss_mlp": 3.203125, + "step": 17, + "time_per_iteration": 7.071341276168823 + }, + { + "auxiliary_loss_clip": 0.02475809, + "auxiliary_loss_mlp": 0.01746086, + "balance_loss_clip": 2.13357472, + "balance_loss_mlp": 1.43328094, + "epoch": 0.0021643720315036373, + "flos": 63863011015200.0, + "grad_norm": 3.5352718235357266, + "language_loss": 0.7219156, + "learning_rate": 2.0939181139872922e-06, + "loss": 0.76413459, + "num_input_tokens_seen": 462605, + "router_z_loss_clip": 3.421875, + "router_z_loss_mlp": 3.125, + "step": 18, + "time_per_iteration": 3.4298877716064453 + }, + { + "auxiliary_loss_clip": 0.01999548, + "auxiliary_loss_mlp": 0.01756563, + "balance_loss_clip": 1.65816784, + "balance_loss_mlp": 1.43422151, + "epoch": 0.0022846149221427284, + "flos": 31287668616000.0, + "grad_norm": 6.3446638710747685, + "language_loss": 1.01640153, + "learning_rate": 2.1330868934640175e-06, + "loss": 1.05396259, + "num_input_tokens_seen": 483280, + "router_z_loss_clip": 3.4140625, + "router_z_loss_mlp": 3.22265625, + "step": 19, + "time_per_iteration": 3.296417713165283 + }, + { + "auxiliary_loss_clip": 0.02466781, + "auxiliary_loss_mlp": 0.01676304, + "balance_loss_clip": 2.12446809, + "balance_loss_mlp": 1.35586929, + "epoch": 0.002404857812781819, + "flos": 51088755071520.0, + "grad_norm": 3.5777489877272095, + "language_loss": 0.76467323, + "learning_rate": 2.170246112844971e-06, + "loss": 0.80610406, + "num_input_tokens_seen": 537620, + "router_z_loss_clip": 3.421875, + "router_z_loss_mlp": 3.203125, + "step": 20, + "time_per_iteration": 3.2324845790863037 + }, + { + "auxiliary_loss_clip": 0.02000369, + "auxiliary_loss_mlp": 0.01704296, + "balance_loss_clip": 1.65802777, + "balance_loss_mlp": 1.37337112, + "epoch": 0.0025251007034209102, + "flos": 15817186602720.0, + "grad_norm": 4.014321284907789, + "language_loss": 1.01563072, + "learning_rate": 2.2055919496770983e-06, + "loss": 1.05267739, + "num_input_tokens_seen": 555760, + "router_z_loss_clip": 3.42578125, + "router_z_loss_mlp": 3.31054688, + "step": 21, + "time_per_iteration": 3.0309150218963623 + }, + { + "auxiliary_loss_clip": 0.01986076, + "auxiliary_loss_mlp": 0.01776348, + "balance_loss_clip": 1.64365792, + "balance_loss_mlp": 1.45591354, + "epoch": 0.0026453435940600014, + "flos": 37854881313120.0, + "grad_norm": 3.604532530022995, + "language_loss": 0.89556372, + "learning_rate": 2.2392931865974923e-06, + "loss": 0.93318796, + "num_input_tokens_seen": 578450, + "router_z_loss_clip": 3.42578125, + "router_z_loss_mlp": 3.203125, + "step": 22, + "time_per_iteration": 3.1658756732940674 + }, + { + "auxiliary_loss_clip": 0.01985629, + "auxiliary_loss_mlp": 0.0168748, + "balance_loss_clip": 1.64354253, + "balance_loss_mlp": 1.35579193, + "epoch": 0.002765586484699092, + "flos": 21144098152800.0, + "grad_norm": 4.132298161715142, + "language_loss": 1.01810586, + "learning_rate": 2.271496085962064e-06, + "loss": 1.05483699, + "num_input_tokens_seen": 596145, + "router_z_loss_clip": 3.41796875, + "router_z_loss_mlp": 3.31835938, + "step": 23, + "time_per_iteration": 3.077746629714966 + }, + { + "auxiliary_loss_clip": 0.01971234, + "auxiliary_loss_mlp": 0.01790395, + "balance_loss_clip": 1.62943876, + "balance_loss_mlp": 1.45851624, + "epoch": 0.002885829375338183, + "flos": 20669460422400.0, + "grad_norm": 4.363443666200237, + "language_loss": 1.02589679, + "learning_rate": 2.3023282262611022e-06, + "loss": 1.06351316, + "num_input_tokens_seen": 614920, + "router_z_loss_clip": 3.41992188, + "router_z_loss_mlp": 3.31835938, + "step": 24, + "time_per_iteration": 3.094956636428833 + }, + { + "auxiliary_loss_clip": 0.01963177, + "auxiliary_loss_mlp": 0.01739985, + "balance_loss_clip": 1.6202873, + "balance_loss_mlp": 1.40867889, + "epoch": 0.003006072265977274, + "flos": 34826742636480.0, + "grad_norm": 3.471980161418326, + "language_loss": 0.92505765, + "learning_rate": 2.3319015548620114e-06, + "loss": 0.9620893, + "num_input_tokens_seen": 636060, + "router_z_loss_clip": 3.42773438, + "router_z_loss_mlp": 3.3125, + "step": 25, + "time_per_iteration": 3.136237382888794 + }, + { + "auxiliary_loss_clip": 0.01964595, + "auxiliary_loss_mlp": 0.01691553, + "balance_loss_clip": 1.62217665, + "balance_loss_mlp": 1.35299826, + "epoch": 0.003126315156616365, + "flos": 24424120290240.0, + "grad_norm": 2.4080108336636656, + "language_loss": 0.92923236, + "learning_rate": 2.3603148416618152e-06, + "loss": 0.96579385, + "num_input_tokens_seen": 655575, + "router_z_loss_clip": 3.421875, + "router_z_loss_mlp": 3.38671875, + "step": 26, + "time_per_iteration": 3.0078563690185547 + }, + { + "auxiliary_loss_clip": 0.01966518, + "auxiliary_loss_mlp": 0.01733374, + "balance_loss_clip": 1.62463284, + "balance_loss_mlp": 1.39424753, + "epoch": 0.003246558047255456, + "flos": 23624548734240.0, + "grad_norm": 2.2372672360871317, + "language_loss": 1.01072609, + "learning_rate": 2.3876556694204647e-06, + "loss": 1.04772496, + "num_input_tokens_seen": 675730, + "router_z_loss_clip": 3.41796875, + "router_z_loss_mlp": 3.39257812, + "step": 27, + "time_per_iteration": 3.061936378479004 + }, + { + "auxiliary_loss_clip": 0.01955298, + "auxiliary_loss_mlp": 0.01700965, + "balance_loss_clip": 1.61392713, + "balance_loss_mlp": 1.35535383, + "epoch": 0.003366800937894547, + "flos": 17822127106080.0, + "grad_norm": 13.173534686300465, + "language_loss": 0.90799296, + "learning_rate": 2.414002061950908e-06, + "loss": 0.94455552, + "num_input_tokens_seen": 694605, + "router_z_loss_clip": 3.4140625, + "router_z_loss_mlp": 3.45703125, + "step": 28, + "time_per_iteration": 3.1656477451324463 + }, + { + "auxiliary_loss_clip": 0.01961072, + "auxiliary_loss_mlp": 0.01709873, + "balance_loss_clip": 1.62008786, + "balance_loss_mlp": 1.37208176, + "epoch": 0.003487043828533638, + "flos": 24428444100480.0, + "grad_norm": 2.4402985355753306, + "language_loss": 1.00029325, + "learning_rate": 2.4394238264681557e-06, + "loss": 1.0370028, + "num_input_tokens_seen": 714340, + "router_z_loss_clip": 3.40820312, + "router_z_loss_mlp": 3.37890625, + "step": 29, + "time_per_iteration": 3.1484155654907227 + }, + { + "auxiliary_loss_clip": 0.01952085, + "auxiliary_loss_mlp": 0.01618687, + "balance_loss_clip": 1.61045456, + "balance_loss_mlp": 1.26697218, + "epoch": 0.003607286719172729, + "flos": 26142700271040.0, + "grad_norm": 2.1842132825276552, + "language_loss": 0.99467564, + "learning_rate": 2.4639836682781433e-06, + "loss": 1.03038335, + "num_input_tokens_seen": 734470, + "router_z_loss_clip": 3.41796875, + "router_z_loss_mlp": 3.51757812, + "step": 30, + "time_per_iteration": 3.1288211345672607 + }, + { + "auxiliary_loss_clip": 0.01974821, + "auxiliary_loss_mlp": 0.01597634, + "balance_loss_clip": 1.63409805, + "balance_loss_mlp": 1.24363041, + "epoch": 0.00372752960981182, + "flos": 20595082572000.0, + "grad_norm": 2.388254858676881, + "language_loss": 1.00244915, + "learning_rate": 2.487738122623307e-06, + "loss": 1.03817368, + "num_input_tokens_seen": 753380, + "router_z_loss_clip": 3.41015625, + "router_z_loss_mlp": 3.54492188, + "step": 31, + "time_per_iteration": 3.1936864852905273 + }, + { + "auxiliary_loss_clip": 0.01943461, + "auxiliary_loss_mlp": 0.01642625, + "balance_loss_clip": 1.60161161, + "balance_loss_mlp": 1.29510653, + "epoch": 0.003847772500450911, + "flos": 22676828328000.0, + "grad_norm": 3.188641004034387, + "language_loss": 0.99093771, + "learning_rate": 2.510738338534912e-06, + "loss": 1.02679861, + "num_input_tokens_seen": 772105, + "router_z_loss_clip": 3.41796875, + "router_z_loss_mlp": 3.4765625, + "step": 32, + "time_per_iteration": 3.0259711742401123 + }, + { + "auxiliary_loss_clip": 0.01917195, + "auxiliary_loss_mlp": 0.01598447, + "balance_loss_clip": 1.57500803, + "balance_loss_mlp": 1.24215388, + "epoch": 0.003968015391090002, + "flos": 17969744962080.0, + "grad_norm": 2.310547499831173, + "language_loss": 1.02526796, + "learning_rate": 2.5330307420306648e-06, + "loss": 1.06042433, + "num_input_tokens_seen": 788955, + "router_z_loss_clip": 3.42578125, + "router_z_loss_mlp": 3.5625, + "step": 33, + "time_per_iteration": 3.011826992034912 + }, + { + "auxiliary_loss_clip": 0.01926547, + "auxiliary_loss_mlp": 0.01658053, + "balance_loss_clip": 1.58516383, + "balance_loss_mlp": 1.30843604, + "epoch": 0.004088258281729093, + "flos": 27306347878080.0, + "grad_norm": 2.1565493735277173, + "language_loss": 0.88170469, + "learning_rate": 2.554657600279796e-06, + "loss": 0.91755074, + "num_input_tokens_seen": 810230, + "router_z_loss_clip": 3.4140625, + "router_z_loss_mlp": 3.49804688, + "step": 34, + "time_per_iteration": 3.0991456508636475 + }, + { + "auxiliary_loss_clip": 0.01917029, + "auxiliary_loss_mlp": 0.01656444, + "balance_loss_clip": 1.57524753, + "balance_loss_mlp": 1.30434728, + "epoch": 0.004208501172368184, + "flos": 23260851964800.0, + "grad_norm": 2.1693633473192517, + "language_loss": 1.03493237, + "learning_rate": 2.5756575039679493e-06, + "loss": 1.07066715, + "num_input_tokens_seen": 829780, + "router_z_loss_clip": 3.41796875, + "router_z_loss_mlp": 3.5234375, + "step": 35, + "time_per_iteration": 3.1044154167175293 + }, + { + "auxiliary_loss_clip": 0.01904751, + "auxiliary_loss_mlp": 0.01594238, + "balance_loss_clip": 1.56154799, + "balance_loss_mlp": 1.23298645, + "epoch": 0.0043287440630072746, + "flos": 17314074302400.0, + "grad_norm": 1.9022498808099926, + "language_loss": 0.95089483, + "learning_rate": 2.5960657816942747e-06, + "loss": 0.98588467, + "num_input_tokens_seen": 848695, + "router_z_loss_clip": 3.43164062, + "router_z_loss_mlp": 3.61328125, + "step": 36, + "time_per_iteration": 3.118865489959717 + }, + { + "auxiliary_loss_clip": 0.02325284, + "auxiliary_loss_mlp": 0.02036487, + "balance_loss_clip": 1.98413014, + "balance_loss_mlp": 1.63670731, + "epoch": 0.004448986953646365, + "flos": 53097829744320.0, + "grad_norm": 1.536677628837017, + "language_loss": 0.60954666, + "learning_rate": 2.6159148575788668e-06, + "loss": 0.65316439, + "num_input_tokens_seen": 906730, + "router_z_loss_clip": 3.40625, + "router_z_loss_mlp": 3.984375, + "step": 37, + "time_per_iteration": 3.5251975059509277 + }, + { + "auxiliary_loss_clip": 0.01899781, + "auxiliary_loss_mlp": 0.01593303, + "balance_loss_clip": 1.55648518, + "balance_loss_mlp": 1.24044299, + "epoch": 0.004569229844285457, + "flos": 13445894358720.0, + "grad_norm": 2.698838968797351, + "language_loss": 0.98714495, + "learning_rate": 2.635234561171e-06, + "loss": 1.02207577, + "num_input_tokens_seen": 925125, + "router_z_loss_clip": 3.43164062, + "router_z_loss_mlp": 3.52929688, + "step": 38, + "time_per_iteration": 3.159536600112915 + }, + { + "auxiliary_loss_clip": 0.01898855, + "auxiliary_loss_mlp": 0.01540233, + "balance_loss_clip": 1.55646133, + "balance_loss_mlp": 1.17573869, + "epoch": 0.0046894727349245475, + "flos": 16211074551840.0, + "grad_norm": 2.5542410735694703, + "language_loss": 0.94070894, + "learning_rate": 2.6540523970949877e-06, + "loss": 0.9750998, + "num_input_tokens_seen": 939970, + "router_z_loss_clip": 3.421875, + "router_z_loss_mlp": 3.64257812, + "step": 39, + "time_per_iteration": 3.0634350776672363 + }, + { + "auxiliary_loss_clip": 0.01903629, + "auxiliary_loss_mlp": 0.01625578, + "balance_loss_clip": 1.56068945, + "balance_loss_mlp": 1.27043021, + "epoch": 0.004809715625563638, + "flos": 23916484696320.0, + "grad_norm": 2.670383641412227, + "language_loss": 0.92526174, + "learning_rate": 2.6723937805519533e-06, + "loss": 0.96055377, + "num_input_tokens_seen": 957470, + "router_z_loss_clip": 3.43164062, + "router_z_loss_mlp": 3.55078125, + "step": 40, + "time_per_iteration": 3.0895140171051025 + }, + { + "auxiliary_loss_clip": 0.01894388, + "auxiliary_loss_mlp": 0.01582762, + "balance_loss_clip": 1.55202973, + "balance_loss_mlp": 1.22513461, + "epoch": 0.00492995851620273, + "flos": 20774901800160.0, + "grad_norm": 2.230405175878623, + "language_loss": 0.93079782, + "learning_rate": 2.690282243737839e-06, + "loss": 0.96556932, + "num_input_tokens_seen": 976405, + "router_z_loss_clip": 3.42382812, + "router_z_loss_mlp": 3.57421875, + "step": 41, + "time_per_iteration": 3.2809793949127197 + }, + { + "auxiliary_loss_clip": 0.01875626, + "auxiliary_loss_mlp": 0.01580582, + "balance_loss_clip": 1.5327127, + "balance_loss_mlp": 1.22543406, + "epoch": 0.0050502014068418205, + "flos": 20340695852640.0, + "grad_norm": 5.07365449073193, + "language_loss": 0.99401236, + "learning_rate": 2.7077396173840807e-06, + "loss": 1.02857447, + "num_input_tokens_seen": 994690, + "router_z_loss_clip": 3.42773438, + "router_z_loss_mlp": 3.55273438, + "step": 42, + "time_per_iteration": 3.01865816116333 + }, + { + "auxiliary_loss_clip": 0.01877864, + "auxiliary_loss_mlp": 0.01616998, + "balance_loss_clip": 1.53412855, + "balance_loss_mlp": 1.25269485, + "epoch": 0.005170444297480911, + "flos": 25997282248320.0, + "grad_norm": 2.4368327037345074, + "language_loss": 0.92782581, + "learning_rate": 2.7247861909342594e-06, + "loss": 0.96277446, + "num_input_tokens_seen": 1015615, + "router_z_loss_clip": 3.43945312, + "router_z_loss_mlp": 3.64257812, + "step": 43, + "time_per_iteration": 3.105135679244995 + }, + { + "auxiliary_loss_clip": 0.01883571, + "auxiliary_loss_mlp": 0.0156522, + "balance_loss_clip": 1.53920245, + "balance_loss_mlp": 1.19405007, + "epoch": 0.005290687188120003, + "flos": 20955858873120.0, + "grad_norm": 2.925582888181357, + "language_loss": 0.82870626, + "learning_rate": 2.7414408543044743e-06, + "loss": 0.86319411, + "num_input_tokens_seen": 1031255, + "router_z_loss_clip": 3.44335938, + "router_z_loss_mlp": 3.70898438, + "step": 44, + "time_per_iteration": 4.803717374801636 + }, + { + "auxiliary_loss_clip": 0.01866224, + "auxiliary_loss_mlp": 0.01568474, + "balance_loss_clip": 1.52137375, + "balance_loss_mlp": 1.21065581, + "epoch": 0.005410930078759093, + "flos": 15853218719040.0, + "grad_norm": 5.325549080618152, + "language_loss": 0.79468906, + "learning_rate": 2.7577212237113157e-06, + "loss": 0.82903606, + "num_input_tokens_seen": 1048295, + "router_z_loss_clip": 3.44921875, + "router_z_loss_mlp": 3.58007812, + "step": 45, + "time_per_iteration": 4.044114112854004 + }, + { + "auxiliary_loss_clip": 0.01874422, + "auxiliary_loss_mlp": 0.01556771, + "balance_loss_clip": 1.53156173, + "balance_loss_mlp": 1.19227719, + "epoch": 0.005531172969398184, + "flos": 21107079904320.0, + "grad_norm": 2.072620870003065, + "language_loss": 1.04219341, + "learning_rate": 2.7736437536690466e-06, + "loss": 1.07650542, + "num_input_tokens_seen": 1067925, + "router_z_loss_clip": 3.43164062, + "router_z_loss_mlp": 3.64453125, + "step": 46, + "time_per_iteration": 3.014451026916504 + }, + { + "auxiliary_loss_clip": 0.01860058, + "auxiliary_loss_mlp": 0.01534034, + "balance_loss_clip": 1.51546502, + "balance_loss_mlp": 1.16419911, + "epoch": 0.005651415860037276, + "flos": 20846700535680.0, + "grad_norm": 2.143353160765761, + "language_loss": 1.07909012, + "learning_rate": 2.789223836941131e-06, + "loss": 1.11303115, + "num_input_tokens_seen": 1088060, + "router_z_loss_clip": 3.44726562, + "router_z_loss_mlp": 3.6953125, + "step": 47, + "time_per_iteration": 3.1531975269317627 + }, + { + "auxiliary_loss_clip": 0.01867659, + "auxiliary_loss_mlp": 0.01575646, + "balance_loss_clip": 1.52354991, + "balance_loss_mlp": 1.20447636, + "epoch": 0.005771658750676366, + "flos": 13261827176640.0, + "grad_norm": 2.3214895206889374, + "language_loss": 1.08775735, + "learning_rate": 2.8044758939680847e-06, + "loss": 1.12219048, + "num_input_tokens_seen": 1104130, + "router_z_loss_clip": 3.44140625, + "router_z_loss_mlp": 3.7109375, + "step": 48, + "time_per_iteration": 3.094665288925171 + }, + { + "auxiliary_loss_clip": 0.01868481, + "auxiliary_loss_mlp": 0.01577504, + "balance_loss_clip": 1.52343845, + "balance_loss_mlp": 1.21663404, + "epoch": 0.005891901641315457, + "flos": 24427685537280.0, + "grad_norm": 2.7284235950710154, + "language_loss": 1.02160311, + "learning_rate": 2.8194134530738863e-06, + "loss": 1.05606294, + "num_input_tokens_seen": 1122900, + "router_z_loss_clip": 3.4453125, + "router_z_loss_mlp": 3.609375, + "step": 49, + "time_per_iteration": 3.061480760574341 + }, + { + "auxiliary_loss_clip": 0.01869792, + "auxiliary_loss_mlp": 0.0158715, + "balance_loss_clip": 1.52630901, + "balance_loss_mlp": 1.22055769, + "epoch": 0.006012144531954548, + "flos": 23078453621760.0, + "grad_norm": 4.046297388329937, + "language_loss": 0.90297329, + "learning_rate": 2.834049222568994e-06, + "loss": 0.93754268, + "num_input_tokens_seen": 1140250, + "router_z_loss_clip": 3.4375, + "router_z_loss_mlp": 3.66210938, + "step": 50, + "time_per_iteration": 3.1564111709594727 + }, + { + "auxiliary_loss_clip": 0.01846689, + "auxiliary_loss_mlp": 0.01526004, + "balance_loss_clip": 1.50340843, + "balance_loss_mlp": 1.15407109, + "epoch": 0.006132387422593639, + "flos": 22530993095520.0, + "grad_norm": 3.0171525564121695, + "language_loss": 0.92592382, + "learning_rate": 2.848395155712969e-06, + "loss": 0.95965075, + "num_input_tokens_seen": 1160470, + "router_z_loss_clip": 3.43164062, + "router_z_loss_mlp": 3.71484375, + "step": 51, + "time_per_iteration": 3.2092227935791016 + }, + { + "auxiliary_loss_clip": 0.01867617, + "auxiliary_loss_mlp": 0.01560398, + "balance_loss_clip": 1.52370429, + "balance_loss_mlp": 1.19323373, + "epoch": 0.00625263031323273, + "flos": 27630371427840.0, + "grad_norm": 2.7398071382672877, + "language_loss": 0.97629279, + "learning_rate": 2.8624625093687977e-06, + "loss": 1.01057291, + "num_input_tokens_seen": 1177605, + "router_z_loss_clip": 3.43554688, + "router_z_loss_mlp": 3.66796875, + "step": 52, + "time_per_iteration": 3.058035135269165 + }, + { + "auxiliary_loss_clip": 0.01857036, + "auxiliary_loss_mlp": 0.01557689, + "balance_loss_clip": 1.51166773, + "balance_loss_mlp": 1.18117809, + "epoch": 0.006372873203871821, + "flos": 23112816899040.0, + "grad_norm": 2.598098657165095, + "language_loss": 0.89194357, + "learning_rate": 2.876261897070029e-06, + "loss": 0.92609072, + "num_input_tokens_seen": 1197735, + "router_z_loss_clip": 3.45507812, + "router_z_loss_mlp": 3.765625, + "step": 53, + "time_per_iteration": 3.191089391708374 + }, + { + "auxiliary_loss_clip": 0.01845305, + "auxiliary_loss_mlp": 0.0157223, + "balance_loss_clip": 1.50071609, + "balance_loss_mlp": 1.2052567, + "epoch": 0.006493116094510912, + "flos": 22858202610720.0, + "grad_norm": 2.6139907723815585, + "language_loss": 0.92523217, + "learning_rate": 2.889803337127447e-06, + "loss": 0.95940751, + "num_input_tokens_seen": 1216335, + "router_z_loss_clip": 3.4453125, + "router_z_loss_mlp": 3.66796875, + "step": 54, + "time_per_iteration": 3.0551047325134277 + }, + { + "auxiliary_loss_clip": 0.01850702, + "auxiliary_loss_mlp": 0.01573554, + "balance_loss_clip": 1.50489473, + "balance_loss_mlp": 1.21134901, + "epoch": 0.006613358985150003, + "flos": 23073712601760.0, + "grad_norm": 3.5445984805567687, + "language_loss": 0.8462249, + "learning_rate": 2.903096296321516e-06, + "loss": 0.88046747, + "num_input_tokens_seen": 1234480, + "router_z_loss_clip": 3.45703125, + "router_z_loss_mlp": 3.61914062, + "step": 55, + "time_per_iteration": 3.1293742656707764 + }, + { + "auxiliary_loss_clip": 0.0185489, + "auxiliary_loss_mlp": 0.01570198, + "balance_loss_clip": 1.51144528, + "balance_loss_mlp": 1.20627642, + "epoch": 0.006733601875789094, + "flos": 26539850041920.0, + "grad_norm": 2.3525978472986564, + "language_loss": 0.91580051, + "learning_rate": 2.9161497296578907e-06, + "loss": 0.95005143, + "num_input_tokens_seen": 1253870, + "router_z_loss_clip": 3.4375, + "router_z_loss_mlp": 3.63867188, + "step": 56, + "time_per_iteration": 3.081537961959839 + }, + { + "auxiliary_loss_clip": 0.0183602, + "auxiliary_loss_mlp": 0.01550431, + "balance_loss_clip": 1.49117112, + "balance_loss_mlp": 1.18116784, + "epoch": 0.006853844766428185, + "flos": 15524985143520.0, + "grad_norm": 2.41329540018121, + "language_loss": 0.85821247, + "learning_rate": 2.928972116604173e-06, + "loss": 0.89207697, + "num_input_tokens_seen": 1270145, + "router_z_loss_clip": 3.45117188, + "router_z_loss_mlp": 3.69140625, + "step": 57, + "time_per_iteration": 2.9876959323883057 + }, + { + "auxiliary_loss_clip": 0.01848184, + "auxiliary_loss_mlp": 0.0155652, + "balance_loss_clip": 1.50383306, + "balance_loss_mlp": 1.18725777, + "epoch": 0.006974087657067276, + "flos": 24246083685600.0, + "grad_norm": 2.4354708693187934, + "language_loss": 1.02001262, + "learning_rate": 2.9415714941751377e-06, + "loss": 1.05405962, + "num_input_tokens_seen": 1291365, + "router_z_loss_clip": 3.4453125, + "router_z_loss_mlp": 3.69140625, + "step": 58, + "time_per_iteration": 3.084993600845337 + }, + { + "auxiliary_loss_clip": 0.01833239, + "auxiliary_loss_mlp": 0.01575494, + "balance_loss_clip": 1.48810744, + "balance_loss_mlp": 1.21462369, + "epoch": 0.007094330547706367, + "flos": 25774717619520.0, + "grad_norm": 2.138209607189185, + "language_loss": 0.93531573, + "learning_rate": 2.9539554871897396e-06, + "loss": 0.96940315, + "num_input_tokens_seen": 1311535, + "router_z_loss_clip": 3.453125, + "router_z_loss_mlp": 3.609375, + "step": 59, + "time_per_iteration": 3.032778024673462 + }, + { + "auxiliary_loss_clip": 0.01832826, + "auxiliary_loss_mlp": 0.01575647, + "balance_loss_clip": 1.48852324, + "balance_loss_mlp": 1.21115303, + "epoch": 0.007214573438345458, + "flos": 21320845200000.0, + "grad_norm": 2.0631128760839887, + "language_loss": 0.97662562, + "learning_rate": 2.9661313359851253e-06, + "loss": 1.01071036, + "num_input_tokens_seen": 1329420, + "router_z_loss_clip": 3.44140625, + "router_z_loss_mlp": 3.64453125, + "step": 60, + "time_per_iteration": 3.1293041706085205 + }, + { + "auxiliary_loss_clip": 0.01829886, + "auxiliary_loss_mlp": 0.01606197, + "balance_loss_clip": 1.48627329, + "balance_loss_mlp": 1.24208391, + "epoch": 0.007334816328984549, + "flos": 24939455300640.0, + "grad_norm": 2.2402406845628224, + "language_loss": 0.94014144, + "learning_rate": 2.978105921839922e-06, + "loss": 0.97450221, + "num_input_tokens_seen": 1349965, + "router_z_loss_clip": 3.43359375, + "router_z_loss_mlp": 3.640625, + "step": 61, + "time_per_iteration": 3.140129327774048 + }, + { + "auxiliary_loss_clip": 0.0183279, + "auxiliary_loss_mlp": 0.01584532, + "balance_loss_clip": 1.4884479, + "balance_loss_mlp": 1.22022867, + "epoch": 0.00745505921962364, + "flos": 18512312755680.0, + "grad_norm": 2.3452860424755855, + "language_loss": 0.72144592, + "learning_rate": 2.9898857903302893e-06, + "loss": 0.75561917, + "num_input_tokens_seen": 1368915, + "router_z_loss_clip": 3.44921875, + "router_z_loss_mlp": 3.640625, + "step": 62, + "time_per_iteration": 3.1047356128692627 + }, + { + "auxiliary_loss_clip": 0.01835024, + "auxiliary_loss_mlp": 0.01578951, + "balance_loss_clip": 1.49226689, + "balance_loss_mlp": 1.20568252, + "epoch": 0.007575302110262731, + "flos": 18479580389280.0, + "grad_norm": 3.1660912365945912, + "language_loss": 0.87945306, + "learning_rate": 3.001477172817253e-06, + "loss": 0.91359288, + "num_input_tokens_seen": 1386805, + "router_z_loss_clip": 3.42773438, + "router_z_loss_mlp": 3.73242188, + "step": 63, + "time_per_iteration": 3.0152220726013184 + }, + { + "auxiliary_loss_clip": 0.01831201, + "auxiliary_loss_mlp": 0.01585531, + "balance_loss_clip": 1.48770332, + "balance_loss_mlp": 1.20863891, + "epoch": 0.007695545000901822, + "flos": 24975525345120.0, + "grad_norm": 2.8931844075477824, + "language_loss": 0.96453017, + "learning_rate": 3.012886006241894e-06, + "loss": 0.99869746, + "num_input_tokens_seen": 1406190, + "router_z_loss_clip": 3.43164062, + "router_z_loss_mlp": 3.765625, + "step": 64, + "time_per_iteration": 3.1551103591918945 + }, + { + "auxiliary_loss_clip": 0.01818566, + "auxiliary_loss_mlp": 0.01563657, + "balance_loss_clip": 1.47492027, + "balance_loss_mlp": 1.19878089, + "epoch": 0.007815787891540913, + "flos": 21326344783200.0, + "grad_norm": 1.8324766117085787, + "language_loss": 0.88185596, + "learning_rate": 3.0241179513858383e-06, + "loss": 0.9156782, + "num_input_tokens_seen": 1425500, + "router_z_loss_clip": 3.43945312, + "router_z_loss_mlp": 3.64648438, + "step": 65, + "time_per_iteration": 2.9885659217834473 + }, + { + "auxiliary_loss_clip": 0.01809791, + "auxiliary_loss_mlp": 0.01579627, + "balance_loss_clip": 1.46566701, + "balance_loss_mlp": 1.21551442, + "epoch": 0.007936030782180003, + "flos": 21577697249760.0, + "grad_norm": 4.751979433858986, + "language_loss": 0.87777501, + "learning_rate": 3.035178409737647e-06, + "loss": 0.91166925, + "num_input_tokens_seen": 1442950, + "router_z_loss_clip": 3.4453125, + "router_z_loss_mlp": 3.640625, + "step": 66, + "time_per_iteration": 3.0553717613220215 + }, + { + "auxiliary_loss_clip": 0.01810581, + "auxiliary_loss_mlp": 0.01591137, + "balance_loss_clip": 1.46575904, + "balance_loss_mlp": 1.21996737, + "epoch": 0.008056273672819095, + "flos": 20122872243840.0, + "grad_norm": 2.3542522172063007, + "language_loss": 0.88645911, + "learning_rate": 3.046072539090907e-06, + "loss": 0.92047632, + "num_input_tokens_seen": 1460915, + "router_z_loss_clip": 3.4453125, + "router_z_loss_mlp": 3.71289062, + "step": 67, + "time_per_iteration": 3.0082168579101562 + }, + { + "auxiliary_loss_clip": 0.01808771, + "auxiliary_loss_mlp": 0.01634249, + "balance_loss_clip": 1.46531248, + "balance_loss_mlp": 1.27700233, + "epoch": 0.008176516563458186, + "flos": 18335110570560.0, + "grad_norm": 2.6820677017452828, + "language_loss": 1.04760993, + "learning_rate": 3.056805267986779e-06, + "loss": 1.08204019, + "num_input_tokens_seen": 1478385, + "router_z_loss_clip": 3.43164062, + "router_z_loss_mlp": 3.5703125, + "step": 68, + "time_per_iteration": 3.113341808319092 + }, + { + "auxiliary_loss_clip": 0.01807789, + "auxiliary_loss_mlp": 0.01581953, + "balance_loss_clip": 1.46530533, + "balance_loss_mlp": 1.21116436, + "epoch": 0.008296759454097276, + "flos": 21874260447360.0, + "grad_norm": 2.5029935271977943, + "language_loss": 0.95334494, + "learning_rate": 3.0673813091022194e-06, + "loss": 0.98724234, + "num_input_tokens_seen": 1497605, + "router_z_loss_clip": 3.42578125, + "router_z_loss_mlp": 3.70703125, + "step": 69, + "time_per_iteration": 3.0955963134765625 + }, + { + "auxiliary_loss_clip": 0.02130096, + "auxiliary_loss_mlp": 0.01675594, + "balance_loss_clip": 1.78269684, + "balance_loss_mlp": 1.36507797, + "epoch": 0.008417002344736368, + "flos": 63415643631840.0, + "grad_norm": 1.381901550815869, + "language_loss": 0.62103873, + "learning_rate": 3.0778051716749317e-06, + "loss": 0.65909564, + "num_input_tokens_seen": 1561150, + "router_z_loss_clip": 3.46875, + "router_z_loss_mlp": 3.1015625, + "step": 70, + "time_per_iteration": 3.850698471069336 + }, + { + "auxiliary_loss_clip": 0.01795839, + "auxiliary_loss_mlp": 0.01525026, + "balance_loss_clip": 1.45239401, + "balance_loss_mlp": 1.15786135, + "epoch": 0.008537245235375458, + "flos": 22968688433760.0, + "grad_norm": 2.1222425762370425, + "language_loss": 0.9019047, + "learning_rate": 3.0880811730470094e-06, + "loss": 0.93511331, + "num_input_tokens_seen": 1580605, + "router_z_loss_clip": 3.43359375, + "router_z_loss_mlp": 3.66796875, + "step": 71, + "time_per_iteration": 4.037469387054443 + }, + { + "auxiliary_loss_clip": 0.02096294, + "auxiliary_loss_mlp": 0.01489567, + "balance_loss_clip": 1.74830186, + "balance_loss_mlp": 1.15997696, + "epoch": 0.008657488126014549, + "flos": 61991313230880.0, + "grad_norm": 1.1900711476632853, + "language_loss": 0.58652163, + "learning_rate": 3.098213449401257e-06, + "loss": 0.6223802, + "num_input_tokens_seen": 1647535, + "router_z_loss_clip": 3.484375, + "router_z_loss_mlp": 3.296875, + "step": 72, + "time_per_iteration": 6.050449848175049 + }, + { + "auxiliary_loss_clip": 0.01784926, + "auxiliary_loss_mlp": 0.01554725, + "balance_loss_clip": 1.44215655, + "balance_loss_mlp": 1.17859554, + "epoch": 0.00877773101665364, + "flos": 30299250929760.0, + "grad_norm": 6.003412438139474, + "language_loss": 0.98701495, + "learning_rate": 3.1082059657570015e-06, + "loss": 1.02041149, + "num_input_tokens_seen": 1666770, + "router_z_loss_clip": 3.42773438, + "router_z_loss_mlp": 3.7578125, + "step": 73, + "time_per_iteration": 3.127108335494995 + }, + { + "auxiliary_loss_clip": 0.01788143, + "auxiliary_loss_mlp": 0.01591979, + "balance_loss_clip": 1.44465804, + "balance_loss_mlp": 1.21451485, + "epoch": 0.00889797390729273, + "flos": 23516338600800.0, + "grad_norm": 2.9001128809704086, + "language_loss": 0.96889949, + "learning_rate": 3.1180625252858496e-06, + "loss": 1.00270081, + "num_input_tokens_seen": 1685200, + "router_z_loss_clip": 3.43164062, + "router_z_loss_mlp": 3.76953125, + "step": 74, + "time_per_iteration": 3.063570022583008 + }, + { + "auxiliary_loss_clip": 0.01793657, + "auxiliary_loss_mlp": 0.01543157, + "balance_loss_clip": 1.45066977, + "balance_loss_mlp": 1.15348589, + "epoch": 0.009018216797931822, + "flos": 23078074340160.0, + "grad_norm": 4.6484689795248855, + "language_loss": 0.7982834, + "learning_rate": 3.1277867780021663e-06, + "loss": 0.83165157, + "num_input_tokens_seen": 1701835, + "router_z_loss_clip": 3.42578125, + "router_z_loss_mlp": 3.89453125, + "step": 75, + "time_per_iteration": 2.9752562046051025 + }, + { + "auxiliary_loss_clip": 0.01783606, + "auxiliary_loss_mlp": 0.01563336, + "balance_loss_clip": 1.44056427, + "balance_loss_mlp": 1.1740458, + "epoch": 0.009138459688570914, + "flos": 15920390219040.0, + "grad_norm": 2.4502992620976873, + "language_loss": 0.95731246, + "learning_rate": 3.1373822288779824e-06, + "loss": 0.99078184, + "num_input_tokens_seen": 1718415, + "router_z_loss_clip": 3.4296875, + "router_z_loss_mlp": 3.890625, + "step": 76, + "time_per_iteration": 3.116710662841797 + }, + { + "auxiliary_loss_clip": 0.01771888, + "auxiliary_loss_mlp": 0.01615832, + "balance_loss_clip": 1.42701793, + "balance_loss_mlp": 1.24161005, + "epoch": 0.009258702579210003, + "flos": 27019001223360.0, + "grad_norm": 6.361629527680363, + "language_loss": 0.79523945, + "learning_rate": 3.1468522454274533e-06, + "loss": 0.82911658, + "num_input_tokens_seen": 1738770, + "router_z_loss_clip": 3.44726562, + "router_z_loss_mlp": 3.7421875, + "step": 77, + "time_per_iteration": 3.085973024368286 + }, + { + "auxiliary_loss_clip": 0.01773012, + "auxiliary_loss_mlp": 0.01543396, + "balance_loss_clip": 1.43032646, + "balance_loss_mlp": 1.16783941, + "epoch": 0.009378945469849095, + "flos": 26905443219360.0, + "grad_norm": 4.230996501354527, + "language_loss": 0.91810048, + "learning_rate": 3.15620006480197e-06, + "loss": 0.95126462, + "num_input_tokens_seen": 1758040, + "router_z_loss_clip": 3.42382812, + "router_z_loss_mlp": 3.75195312, + "step": 78, + "time_per_iteration": 3.0526254177093506 + }, + { + "auxiliary_loss_clip": 0.01773809, + "auxiliary_loss_mlp": 0.01541954, + "balance_loss_clip": 1.42942166, + "balance_loss_mlp": 1.15419006, + "epoch": 0.009499188360488187, + "flos": 35696861298720.0, + "grad_norm": 3.2842612050443916, + "language_loss": 0.74814665, + "learning_rate": 3.1654288004333087e-06, + "loss": 0.78130424, + "num_input_tokens_seen": 1776705, + "router_z_loss_clip": 3.4453125, + "router_z_loss_mlp": 3.875, + "step": 79, + "time_per_iteration": 3.128460168838501 + }, + { + "auxiliary_loss_clip": 0.01775723, + "auxiliary_loss_mlp": 0.01569534, + "balance_loss_clip": 1.43312597, + "balance_loss_mlp": 1.18081605, + "epoch": 0.009619431251127276, + "flos": 21505139951040.0, + "grad_norm": 3.0580119126934546, + "language_loss": 0.76385665, + "learning_rate": 3.1745414482589353e-06, + "loss": 0.79730916, + "num_input_tokens_seen": 1795915, + "router_z_loss_clip": 3.42578125, + "router_z_loss_mlp": 3.88671875, + "step": 80, + "time_per_iteration": 3.033447265625 + }, + { + "auxiliary_loss_clip": 0.01774976, + "auxiliary_loss_mlp": 0.01555366, + "balance_loss_clip": 1.43157601, + "balance_loss_mlp": 1.16779268, + "epoch": 0.009739674141766368, + "flos": 17423005070880.0, + "grad_norm": 2.8450979640583367, + "language_loss": 0.87023687, + "learning_rate": 3.1835408925606204e-06, + "loss": 0.90354031, + "num_input_tokens_seen": 1814055, + "router_z_loss_clip": 3.43164062, + "router_z_loss_mlp": 3.87109375, + "step": 81, + "time_per_iteration": 3.129096746444702 + }, + { + "auxiliary_loss_clip": 0.01769577, + "auxiliary_loss_mlp": 0.01563891, + "balance_loss_clip": 1.42567801, + "balance_loss_mlp": 1.1789881, + "epoch": 0.00985991703240546, + "flos": 27529595213760.0, + "grad_norm": 2.8711758606771784, + "language_loss": 0.89439368, + "learning_rate": 3.1924299114448214e-06, + "loss": 0.92772835, + "num_input_tokens_seen": 1834535, + "router_z_loss_clip": 3.4375, + "router_z_loss_mlp": 3.84570312, + "step": 82, + "time_per_iteration": 3.134516477584839 + }, + { + "auxiliary_loss_clip": 0.01764922, + "auxiliary_loss_mlp": 0.01562252, + "balance_loss_clip": 1.42041409, + "balance_loss_mlp": 1.17563272, + "epoch": 0.00998015992304455, + "flos": 13809780768960.0, + "grad_norm": 2.5936538048818547, + "language_loss": 0.8347342, + "learning_rate": 3.2012111819909055e-06, + "loss": 0.86800587, + "num_input_tokens_seen": 1851865, + "router_z_loss_clip": 3.44335938, + "router_z_loss_mlp": 3.86328125, + "step": 83, + "time_per_iteration": 3.158609628677368 + }, + { + "auxiliary_loss_clip": 0.01763317, + "auxiliary_loss_mlp": 0.01592968, + "balance_loss_clip": 1.41922903, + "balance_loss_mlp": 1.20787442, + "epoch": 0.010100402813683641, + "flos": 20193343493760.0, + "grad_norm": 2.4621474529661946, + "language_loss": 0.95062292, + "learning_rate": 3.2098872850910627e-06, + "loss": 0.9841857, + "num_input_tokens_seen": 1868540, + "router_z_loss_clip": 3.44335938, + "router_z_loss_mlp": 3.84960938, + "step": 84, + "time_per_iteration": 3.2642383575439453 + }, + { + "auxiliary_loss_clip": 0.0176837, + "auxiliary_loss_mlp": 0.01558416, + "balance_loss_clip": 1.42388296, + "balance_loss_mlp": 1.17770898, + "epoch": 0.010220645704322733, + "flos": 17203550551200.0, + "grad_norm": 2.688877167665246, + "language_loss": 0.89341301, + "learning_rate": 3.2184607100038194e-06, + "loss": 0.92668092, + "num_input_tokens_seen": 1887180, + "router_z_loss_clip": 3.44335938, + "router_z_loss_mlp": 3.8046875, + "step": 85, + "time_per_iteration": 3.0804483890533447 + }, + { + "auxiliary_loss_clip": 0.01761772, + "auxiliary_loss_mlp": 0.01551001, + "balance_loss_clip": 1.41689336, + "balance_loss_mlp": 1.16914988, + "epoch": 0.010340888594961822, + "flos": 21472559297280.0, + "grad_norm": 2.2361862473078755, + "language_loss": 0.93223226, + "learning_rate": 3.2269338586412414e-06, + "loss": 0.96535993, + "num_input_tokens_seen": 1904765, + "router_z_loss_clip": 3.44726562, + "router_z_loss_mlp": 3.81640625, + "step": 86, + "time_per_iteration": 3.0780580043792725 + }, + { + "auxiliary_loss_clip": 0.01759695, + "auxiliary_loss_mlp": 0.01530563, + "balance_loss_clip": 1.41706133, + "balance_loss_mlp": 1.15100098, + "epoch": 0.010461131485600914, + "flos": 23005061903520.0, + "grad_norm": 4.567950767407209, + "language_loss": 0.96254766, + "learning_rate": 3.2353090496083106e-06, + "loss": 0.9954502, + "num_input_tokens_seen": 1922600, + "router_z_loss_clip": 3.42382812, + "router_z_loss_mlp": 3.79492188, + "step": 87, + "time_per_iteration": 3.136861801147461 + }, + { + "auxiliary_loss_clip": 0.01755882, + "auxiliary_loss_mlp": 0.01564895, + "balance_loss_clip": 1.41100359, + "balance_loss_mlp": 1.18762136, + "epoch": 0.010581374376240005, + "flos": 33549650809920.0, + "grad_norm": 3.7953848781045956, + "language_loss": 0.81319594, + "learning_rate": 3.2435885220114572e-06, + "loss": 0.84640372, + "num_input_tokens_seen": 1943950, + "router_z_loss_clip": 3.44921875, + "router_z_loss_mlp": 3.76757812, + "step": 88, + "time_per_iteration": 3.1035633087158203 + }, + { + "auxiliary_loss_clip": 0.01763278, + "auxiliary_loss_mlp": 0.0153656, + "balance_loss_clip": 1.41948366, + "balance_loss_mlp": 1.16081214, + "epoch": 0.010701617266879095, + "flos": 21765291750720.0, + "grad_norm": 2.064049448519584, + "language_loss": 0.94101083, + "learning_rate": 3.2517744390519113e-06, + "loss": 0.97400916, + "num_input_tokens_seen": 1962815, + "router_z_loss_clip": 3.43945312, + "router_z_loss_mlp": 3.75585938, + "step": 89, + "time_per_iteration": 3.1294260025024414 + }, + { + "auxiliary_loss_clip": 0.01753285, + "auxiliary_loss_mlp": 0.01537286, + "balance_loss_clip": 1.40916932, + "balance_loss_mlp": 1.17069364, + "epoch": 0.010821860157518187, + "flos": 19062086899680.0, + "grad_norm": 2.7909291699697407, + "language_loss": 0.74992311, + "learning_rate": 3.259868891418298e-06, + "loss": 0.78282881, + "num_input_tokens_seen": 1980580, + "router_z_loss_clip": 3.44140625, + "router_z_loss_mlp": 3.6640625, + "step": 90, + "time_per_iteration": 3.0707850456237793 + }, + { + "auxiliary_loss_clip": 0.01769326, + "auxiliary_loss_mlp": 0.01550842, + "balance_loss_clip": 1.42512202, + "balance_loss_mlp": 1.18310571, + "epoch": 0.010942103048157278, + "flos": 25449745865760.0, + "grad_norm": 1.9878871138058933, + "language_loss": 0.85157663, + "learning_rate": 3.2678739004917757e-06, + "loss": 0.88477826, + "num_input_tokens_seen": 2000315, + "router_z_loss_clip": 3.44726562, + "router_z_loss_mlp": 3.67382812, + "step": 91, + "time_per_iteration": 3.125560760498047 + }, + { + "auxiliary_loss_clip": 0.0176297, + "auxiliary_loss_mlp": 0.01576265, + "balance_loss_clip": 1.41761255, + "balance_loss_mlp": 1.20356917, + "epoch": 0.011062345938796368, + "flos": 27496597350240.0, + "grad_norm": 1.7993701657328318, + "language_loss": 0.92400742, + "learning_rate": 3.275791421376029e-06, + "loss": 0.95739979, + "num_input_tokens_seen": 2023760, + "router_z_loss_clip": 3.44726562, + "router_z_loss_mlp": 3.72265625, + "step": 92, + "time_per_iteration": 3.093116521835327 + }, + { + "auxiliary_loss_clip": 0.01747433, + "auxiliary_loss_mlp": 0.01567953, + "balance_loss_clip": 1.40301323, + "balance_loss_mlp": 1.20097971, + "epoch": 0.01118258882943546, + "flos": 16073090448480.0, + "grad_norm": 2.3065196016067464, + "language_loss": 0.95889091, + "learning_rate": 3.2836233457634622e-06, + "loss": 0.99204487, + "num_input_tokens_seen": 2041895, + "router_z_loss_clip": 3.4453125, + "router_z_loss_mlp": 3.66601562, + "step": 93, + "time_per_iteration": 3.0007717609405518 + }, + { + "auxiliary_loss_clip": 0.01740852, + "auxiliary_loss_mlp": 0.0157964, + "balance_loss_clip": 1.39462078, + "balance_loss_mlp": 1.22449172, + "epoch": 0.011302831720074551, + "flos": 20670749979840.0, + "grad_norm": 11.147224639080004, + "language_loss": 0.85333753, + "learning_rate": 3.2913715046481135e-06, + "loss": 0.88654238, + "num_input_tokens_seen": 2061640, + "router_z_loss_clip": 3.45898438, + "router_z_loss_mlp": 3.55273438, + "step": 94, + "time_per_iteration": 2.986821174621582 + }, + { + "auxiliary_loss_clip": 0.01739579, + "auxiliary_loss_mlp": 0.01574373, + "balance_loss_clip": 1.39441276, + "balance_loss_mlp": 1.21502876, + "epoch": 0.011423074610713641, + "flos": 13073549968800.0, + "grad_norm": 2.550211240292382, + "language_loss": 0.88905191, + "learning_rate": 3.299037670895023e-06, + "loss": 0.92219144, + "num_input_tokens_seen": 2078255, + "router_z_loss_clip": 3.45117188, + "router_z_loss_mlp": 3.59375, + "step": 95, + "time_per_iteration": 3.103145122528076 + }, + { + "auxiliary_loss_clip": 0.01747257, + "auxiliary_loss_mlp": 0.01553477, + "balance_loss_clip": 1.401142, + "balance_loss_mlp": 1.18707514, + "epoch": 0.011543317501352733, + "flos": 30338241442560.0, + "grad_norm": 2.283325613685743, + "language_loss": 0.80388916, + "learning_rate": 3.3066235616750667e-06, + "loss": 0.83689654, + "num_input_tokens_seen": 2099490, + "router_z_loss_clip": 3.45898438, + "router_z_loss_mlp": 3.6640625, + "step": 96, + "time_per_iteration": 3.133484363555908 + }, + { + "auxiliary_loss_clip": 0.01746026, + "auxiliary_loss_mlp": 0.01560269, + "balance_loss_clip": 1.39883971, + "balance_loss_mlp": 1.19291401, + "epoch": 0.011663560391991824, + "flos": 15524757574560.0, + "grad_norm": 2.3665786792368286, + "language_loss": 0.92638069, + "learning_rate": 3.3141308407736276e-06, + "loss": 0.95944363, + "num_input_tokens_seen": 2116125, + "router_z_loss_clip": 3.47070312, + "router_z_loss_mlp": 3.66992188, + "step": 97, + "time_per_iteration": 3.168583631515503 + }, + { + "auxiliary_loss_clip": 0.01736802, + "auxiliary_loss_mlp": 0.01577422, + "balance_loss_clip": 1.39180493, + "balance_loss_mlp": 1.21464443, + "epoch": 0.011783803282630914, + "flos": 19904214215520.0, + "grad_norm": 2.9007464676228834, + "language_loss": 0.86713493, + "learning_rate": 3.321561120780869e-06, + "loss": 0.90027714, + "num_input_tokens_seen": 2134835, + "router_z_loss_clip": 3.44921875, + "router_z_loss_mlp": 3.62695312, + "step": 98, + "time_per_iteration": 4.0249669551849365 + }, + { + "auxiliary_loss_clip": 0.01745182, + "auxiliary_loss_mlp": 0.01534567, + "balance_loss_clip": 1.3988235, + "balance_loss_mlp": 1.17197978, + "epoch": 0.011904046173270006, + "flos": 22342564175040.0, + "grad_norm": 3.6216576957325253, + "language_loss": 1.01271796, + "learning_rate": 3.3289159651708192e-06, + "loss": 1.04551542, + "num_input_tokens_seen": 2152410, + "router_z_loss_clip": 3.46484375, + "router_z_loss_mlp": 3.62695312, + "step": 99, + "time_per_iteration": 3.0180346965789795 + }, + { + "auxiliary_loss_clip": 0.01731426, + "auxiliary_loss_mlp": 0.01626263, + "balance_loss_clip": 1.38237906, + "balance_loss_mlp": 1.26138687, + "epoch": 0.012024289063909096, + "flos": 19102139400960.0, + "grad_norm": 2.5118645494213414, + "language_loss": 0.97696525, + "learning_rate": 3.3361968902759768e-06, + "loss": 1.01054215, + "num_input_tokens_seen": 2172090, + "router_z_loss_clip": 3.4921875, + "router_z_loss_mlp": 3.6484375, + "step": 100, + "time_per_iteration": 4.133694171905518 + }, + { + "auxiliary_loss_clip": 0.01740285, + "auxiliary_loss_mlp": 0.01579713, + "balance_loss_clip": 1.393291, + "balance_loss_mlp": 1.22055876, + "epoch": 0.012144531954548187, + "flos": 15013973943360.0, + "grad_norm": 2.586692448376975, + "language_loss": 0.94177514, + "learning_rate": 3.343405367163663e-06, + "loss": 0.97497511, + "num_input_tokens_seen": 2189020, + "router_z_loss_clip": 3.46679688, + "router_z_loss_mlp": 3.59179688, + "step": 101, + "time_per_iteration": 3.162930727005005 + }, + { + "auxiliary_loss_clip": 0.01747925, + "auxiliary_loss_mlp": 0.01611064, + "balance_loss_clip": 1.40172029, + "balance_loss_mlp": 1.2454257, + "epoch": 0.012264774845187279, + "flos": 15123814987680.0, + "grad_norm": 5.742457293012863, + "language_loss": 0.81407964, + "learning_rate": 3.350542823419951e-06, + "loss": 0.8476696, + "num_input_tokens_seen": 2205620, + "router_z_loss_clip": 3.45898438, + "router_z_loss_mlp": 3.65625, + "step": 102, + "time_per_iteration": 3.1106016635894775 + }, + { + "auxiliary_loss_clip": 0.01735156, + "auxiliary_loss_mlp": 0.01613938, + "balance_loss_clip": 1.38662553, + "balance_loss_mlp": 1.26355767, + "epoch": 0.012385017735826368, + "flos": 13950685340640.0, + "grad_norm": 17.641729918309238, + "language_loss": 0.87754929, + "learning_rate": 3.3576106448465615e-06, + "loss": 0.91104019, + "num_input_tokens_seen": 2219000, + "router_z_loss_clip": 3.48046875, + "router_z_loss_mlp": 3.50585938, + "step": 103, + "time_per_iteration": 3.0466113090515137 + }, + { + "auxiliary_loss_clip": 0.01727892, + "auxiliary_loss_mlp": 0.01577234, + "balance_loss_clip": 1.38007343, + "balance_loss_mlp": 1.22876191, + "epoch": 0.01250526062646546, + "flos": 23625534866400.0, + "grad_norm": 9.045563718776819, + "language_loss": 0.88605815, + "learning_rate": 3.3646101770757797e-06, + "loss": 0.9191094, + "num_input_tokens_seen": 2237790, + "router_z_loss_clip": 3.47460938, + "router_z_loss_mlp": 3.48632812, + "step": 104, + "time_per_iteration": 3.1296300888061523 + }, + { + "auxiliary_loss_clip": 0.01739433, + "auxiliary_loss_mlp": 0.01563657, + "balance_loss_clip": 1.3932265, + "balance_loss_mlp": 1.21461165, + "epoch": 0.012625503517104552, + "flos": 34642865095200.0, + "grad_norm": 3.4356491864512173, + "language_loss": 0.85892332, + "learning_rate": 3.371542727108104e-06, + "loss": 0.89195418, + "num_input_tokens_seen": 2259965, + "router_z_loss_clip": 3.45898438, + "router_z_loss_mlp": 3.49023438, + "step": 105, + "time_per_iteration": 3.261150598526001 + }, + { + "auxiliary_loss_clip": 0.01731906, + "auxiliary_loss_mlp": 0.01613278, + "balance_loss_clip": 1.38312936, + "balance_loss_mlp": 1.27491474, + "epoch": 0.012745746407743641, + "flos": 17823833873280.0, + "grad_norm": 3.8548560506789906, + "language_loss": 0.90041322, + "learning_rate": 3.3784095647770114e-06, + "loss": 0.93386507, + "num_input_tokens_seen": 2278610, + "router_z_loss_clip": 3.48632812, + "router_z_loss_mlp": 3.38671875, + "step": 106, + "time_per_iteration": 3.151303291320801 + }, + { + "auxiliary_loss_clip": 0.01729316, + "auxiliary_loss_mlp": 0.01563109, + "balance_loss_clip": 1.38153362, + "balance_loss_mlp": 1.20471787, + "epoch": 0.012865989298382733, + "flos": 20597168620800.0, + "grad_norm": 3.488777714782657, + "language_loss": 0.88803184, + "learning_rate": 3.3852119241449547e-06, + "loss": 0.92095608, + "num_input_tokens_seen": 2297730, + "router_z_loss_clip": 3.47265625, + "router_z_loss_mlp": 3.58398438, + "step": 107, + "time_per_iteration": 3.2474358081817627 + }, + { + "auxiliary_loss_clip": 0.0172885, + "auxiliary_loss_mlp": 0.01562391, + "balance_loss_clip": 1.38193333, + "balance_loss_mlp": 1.20113921, + "epoch": 0.012986232189021825, + "flos": 23951113470720.0, + "grad_norm": 6.529613383586424, + "language_loss": 0.96332431, + "learning_rate": 3.3919510048344295e-06, + "loss": 0.99623668, + "num_input_tokens_seen": 2315740, + "router_z_loss_clip": 3.47070312, + "router_z_loss_mlp": 3.61328125, + "step": 108, + "time_per_iteration": 3.165900468826294 + }, + { + "auxiliary_loss_clip": 0.01732346, + "auxiliary_loss_mlp": 0.01576416, + "balance_loss_clip": 1.38484311, + "balance_loss_mlp": 1.21268511, + "epoch": 0.013106475079660914, + "flos": 23727297212640.0, + "grad_norm": 4.0576569000955125, + "language_loss": 0.8670541, + "learning_rate": 3.3986279732976907e-06, + "loss": 0.90014172, + "num_input_tokens_seen": 2334215, + "router_z_loss_clip": 3.47265625, + "router_z_loss_mlp": 3.63476562, + "step": 109, + "time_per_iteration": 3.1235814094543457 + }, + { + "auxiliary_loss_clip": 0.01727862, + "auxiliary_loss_mlp": 0.01561061, + "balance_loss_clip": 1.38199127, + "balance_loss_mlp": 1.2013346, + "epoch": 0.013226717970300006, + "flos": 21104387004960.0, + "grad_norm": 13.332412101628025, + "language_loss": 0.95681828, + "learning_rate": 3.4052439640284983e-06, + "loss": 0.98970759, + "num_input_tokens_seen": 2353130, + "router_z_loss_clip": 3.4609375, + "router_z_loss_mlp": 3.59570312, + "step": 110, + "time_per_iteration": 3.2206482887268066 + }, + { + "auxiliary_loss_clip": 0.0173027, + "auxiliary_loss_mlp": 0.01559993, + "balance_loss_clip": 1.3844974, + "balance_loss_mlp": 1.20255542, + "epoch": 0.013346960860939098, + "flos": 24866253223200.0, + "grad_norm": 3.2348701415455796, + "language_loss": 0.81300759, + "learning_rate": 3.4118000807190217e-06, + "loss": 0.84591019, + "num_input_tokens_seen": 2374010, + "router_z_loss_clip": 3.45507812, + "router_z_loss_mlp": 3.57617188, + "step": 111, + "time_per_iteration": 3.069685697555542 + }, + { + "auxiliary_loss_clip": 0.0171973, + "auxiliary_loss_mlp": 0.01561221, + "balance_loss_clip": 1.37356937, + "balance_loss_mlp": 1.19863379, + "epoch": 0.013467203751578187, + "flos": 28184355597600.0, + "grad_norm": 3.264198224256694, + "language_loss": 0.76270407, + "learning_rate": 3.4182973973648723e-06, + "loss": 0.79551363, + "num_input_tokens_seen": 2395220, + "router_z_loss_clip": 3.46289062, + "router_z_loss_mlp": 3.62304688, + "step": 112, + "time_per_iteration": 3.1915647983551025 + }, + { + "auxiliary_loss_clip": 0.01731884, + "auxiliary_loss_mlp": 0.01614642, + "balance_loss_clip": 1.38457108, + "balance_loss_mlp": 1.26254499, + "epoch": 0.013587446642217279, + "flos": 18918223931520.0, + "grad_norm": 5.886165765358109, + "language_loss": 0.94822752, + "learning_rate": 3.424736959321014e-06, + "loss": 0.98169279, + "num_input_tokens_seen": 2413025, + "router_z_loss_clip": 3.47265625, + "router_z_loss_mlp": 3.51953125, + "step": 113, + "time_per_iteration": 2.9772427082061768 + }, + { + "auxiliary_loss_clip": 0.01728343, + "auxiliary_loss_mlp": 0.01556723, + "balance_loss_clip": 1.38117707, + "balance_loss_mlp": 1.19775987, + "epoch": 0.01370768953285637, + "flos": 23990938403040.0, + "grad_norm": 3.043687473304829, + "language_loss": 0.88967931, + "learning_rate": 3.431119784311155e-06, + "loss": 0.92252994, + "num_input_tokens_seen": 2432700, + "router_z_loss_clip": 3.46875, + "router_z_loss_mlp": 3.58984375, + "step": 114, + "time_per_iteration": 3.0406582355499268 + }, + { + "auxiliary_loss_clip": 0.01728865, + "auxiliary_loss_mlp": 0.01628035, + "balance_loss_clip": 1.38297224, + "balance_loss_mlp": 1.26296806, + "epoch": 0.01382793242349546, + "flos": 39205706211360.0, + "grad_norm": 6.147559259954397, + "language_loss": 0.7784363, + "learning_rate": 3.43744686339307e-06, + "loss": 0.81200528, + "num_input_tokens_seen": 2455020, + "router_z_loss_clip": 3.45898438, + "router_z_loss_mlp": 3.6484375, + "step": 115, + "time_per_iteration": 3.1606664657592773 + }, + { + "auxiliary_loss_clip": 0.01717169, + "auxiliary_loss_mlp": 0.01529405, + "balance_loss_clip": 1.36975622, + "balance_loss_mlp": 1.16300368, + "epoch": 0.013948175314134552, + "flos": 41356671588000.0, + "grad_norm": 3.4400747269340313, + "language_loss": 0.9100033, + "learning_rate": 3.44371916188212e-06, + "loss": 0.942469, + "num_input_tokens_seen": 2475775, + "router_z_loss_clip": 3.4765625, + "router_z_loss_mlp": 3.6640625, + "step": 116, + "time_per_iteration": 3.1126012802124023 + }, + { + "auxiliary_loss_clip": 0.01720613, + "auxiliary_loss_mlp": 0.01565867, + "balance_loss_clip": 1.37253928, + "balance_loss_mlp": 1.20671296, + "epoch": 0.014068418204773643, + "flos": 22455629112960.0, + "grad_norm": 3.192607282179131, + "language_loss": 0.86551422, + "learning_rate": 3.449937620235143e-06, + "loss": 0.89837897, + "num_input_tokens_seen": 2496370, + "router_z_loss_clip": 3.48242188, + "router_z_loss_mlp": 3.58984375, + "step": 117, + "time_per_iteration": 3.075941801071167 + }, + { + "auxiliary_loss_clip": 0.01723926, + "auxiliary_loss_mlp": 0.01602236, + "balance_loss_clip": 1.37756777, + "balance_loss_mlp": 1.24022126, + "epoch": 0.014188661095412733, + "flos": 23807402215200.0, + "grad_norm": 4.121731581289241, + "language_loss": 0.89470267, + "learning_rate": 3.456103154896722e-06, + "loss": 0.92796433, + "num_input_tokens_seen": 2517645, + "router_z_loss_clip": 3.46289062, + "router_z_loss_mlp": 3.61914062, + "step": 118, + "time_per_iteration": 3.0476701259613037 + }, + { + "auxiliary_loss_clip": 0.01725379, + "auxiliary_loss_mlp": 0.01560165, + "balance_loss_clip": 1.3784343, + "balance_loss_mlp": 1.19109297, + "epoch": 0.014308903986051825, + "flos": 23662553114880.0, + "grad_norm": 12.367232960725993, + "language_loss": 0.92693597, + "learning_rate": 3.462216659109757e-06, + "loss": 0.95979136, + "num_input_tokens_seen": 2537825, + "router_z_loss_clip": 3.46875, + "router_z_loss_mlp": 3.68945312, + "step": 119, + "time_per_iteration": 3.036933660507202 + }, + { + "auxiliary_loss_clip": 0.01744113, + "auxiliary_loss_mlp": 0.01586913, + "balance_loss_clip": 1.39734602, + "balance_loss_mlp": 1.23481631, + "epoch": 0.014429146876690916, + "flos": 20670105201120.0, + "grad_norm": 3.7826479983357943, + "language_loss": 0.85270786, + "learning_rate": 3.4682790036921077e-06, + "loss": 0.88601804, + "num_input_tokens_seen": 2556485, + "router_z_loss_clip": 3.47070312, + "router_z_loss_mlp": 3.5234375, + "step": 120, + "time_per_iteration": 3.2566027641296387 + }, + { + "auxiliary_loss_clip": 0.01731232, + "auxiliary_loss_mlp": 0.01573809, + "balance_loss_clip": 1.38581252, + "balance_loss_mlp": 1.2117945, + "epoch": 0.014549389767330006, + "flos": 20231234089920.0, + "grad_norm": 2.4453202520941244, + "language_loss": 0.8357988, + "learning_rate": 3.4742910377810193e-06, + "loss": 0.86884922, + "num_input_tokens_seen": 2573945, + "router_z_loss_clip": 3.45507812, + "router_z_loss_mlp": 3.61914062, + "step": 121, + "time_per_iteration": 3.014360189437866 + }, + { + "auxiliary_loss_clip": 0.01731825, + "auxiliary_loss_mlp": 0.01614696, + "balance_loss_clip": 1.38540995, + "balance_loss_mlp": 1.259166, + "epoch": 0.014669632657969098, + "flos": 18006118431840.0, + "grad_norm": 2.7929291939927, + "language_loss": 0.89040238, + "learning_rate": 3.4802535895469042e-06, + "loss": 0.92386758, + "num_input_tokens_seen": 2592695, + "router_z_loss_clip": 3.46484375, + "router_z_loss_mlp": 3.55664062, + "step": 122, + "time_per_iteration": 3.1919808387756348 + }, + { + "auxiliary_loss_clip": 0.01721949, + "auxiliary_loss_mlp": 0.01532539, + "balance_loss_clip": 1.37508166, + "balance_loss_mlp": 1.16670978, + "epoch": 0.01478987554860819, + "flos": 22743696402720.0, + "grad_norm": 2.6678306766237228, + "language_loss": 0.89773822, + "learning_rate": 3.4861674668779934e-06, + "loss": 0.93028307, + "num_input_tokens_seen": 2610925, + "router_z_loss_clip": 3.47070312, + "router_z_loss_mlp": 3.65625, + "step": 123, + "time_per_iteration": 3.108888626098633 + }, + { + "auxiliary_loss_clip": 0.01721669, + "auxiliary_loss_mlp": 0.01545029, + "balance_loss_clip": 1.3764956, + "balance_loss_mlp": 1.18568408, + "epoch": 0.01491011843924728, + "flos": 17200326657600.0, + "grad_norm": 2.7076589382825627, + "language_loss": 0.84260082, + "learning_rate": 3.492033458037272e-06, + "loss": 0.8752678, + "num_input_tokens_seen": 2629495, + "router_z_loss_clip": 3.45117188, + "router_z_loss_mlp": 3.59179688, + "step": 124, + "time_per_iteration": 3.158270835876465 + }, + { + "auxiliary_loss_clip": 0.01712933, + "auxiliary_loss_mlp": 0.01546376, + "balance_loss_clip": 1.36503315, + "balance_loss_mlp": 1.17787671, + "epoch": 0.01503036132988637, + "flos": 17675685023040.0, + "grad_norm": 4.073894159898651, + "language_loss": 0.87138015, + "learning_rate": 3.497852332293018e-06, + "loss": 0.90397322, + "num_input_tokens_seen": 2645070, + "router_z_loss_clip": 3.4765625, + "router_z_loss_mlp": 3.68359375, + "step": 125, + "time_per_iteration": 4.1009886264801025 + }, + { + "auxiliary_loss_clip": 0.01727388, + "auxiliary_loss_mlp": 0.0155146, + "balance_loss_clip": 1.38056469, + "balance_loss_mlp": 1.18925476, + "epoch": 0.015150604220525462, + "flos": 18880143694560.0, + "grad_norm": 2.761559148818842, + "language_loss": 0.96868908, + "learning_rate": 3.5036248405242356e-06, + "loss": 1.00147748, + "num_input_tokens_seen": 2663825, + "router_z_loss_clip": 3.46679688, + "router_z_loss_mlp": 3.62304688, + "step": 126, + "time_per_iteration": 3.1386609077453613 + }, + { + "auxiliary_loss_clip": 0.01729485, + "auxiliary_loss_mlp": 0.01543626, + "balance_loss_clip": 1.38356638, + "balance_loss_mlp": 1.17016697, + "epoch": 0.015270847111164552, + "flos": 39423264323040.0, + "grad_norm": 2.46016974245306, + "language_loss": 0.82802433, + "learning_rate": 3.509351715802146e-06, + "loss": 0.86075544, + "num_input_tokens_seen": 2684710, + "router_z_loss_clip": 3.45898438, + "router_z_loss_mlp": 3.73046875, + "step": 127, + "time_per_iteration": 4.700181007385254 + }, + { + "auxiliary_loss_clip": 0.01721828, + "auxiliary_loss_mlp": 0.01581205, + "balance_loss_clip": 1.37537038, + "balance_loss_mlp": 1.22510362, + "epoch": 0.015391090001803644, + "flos": 43765816500000.0, + "grad_norm": 7.834708961369719, + "language_loss": 0.78680623, + "learning_rate": 3.5150336739488763e-06, + "loss": 0.81983662, + "num_input_tokens_seen": 2706995, + "router_z_loss_clip": 3.46484375, + "router_z_loss_mlp": 3.5625, + "step": 128, + "time_per_iteration": 3.194401264190674 + }, + { + "auxiliary_loss_clip": 0.01731123, + "auxiliary_loss_mlp": 0.01527981, + "balance_loss_clip": 1.38626194, + "balance_loss_mlp": 1.15604818, + "epoch": 0.015511332892442733, + "flos": 18918337716000.0, + "grad_norm": 2.4371156596972834, + "language_loss": 0.84331512, + "learning_rate": 3.5206714140744143e-06, + "loss": 0.87590623, + "num_input_tokens_seen": 2727050, + "router_z_loss_clip": 3.45117188, + "router_z_loss_mlp": 3.71679688, + "step": 129, + "time_per_iteration": 3.18670916557312 + }, + { + "auxiliary_loss_clip": 0.01743151, + "auxiliary_loss_mlp": 0.01572003, + "balance_loss_clip": 1.39749336, + "balance_loss_mlp": 1.20426607, + "epoch": 0.015631575783081827, + "flos": 24537602437920.0, + "grad_norm": 4.445160289879986, + "language_loss": 0.87784147, + "learning_rate": 3.5262656190928208e-06, + "loss": 0.91099298, + "num_input_tokens_seen": 2745350, + "router_z_loss_clip": 3.45898438, + "router_z_loss_mlp": 3.6796875, + "step": 130, + "time_per_iteration": 3.0751125812530518 + }, + { + "auxiliary_loss_clip": 0.01946277, + "auxiliary_loss_mlp": 0.01741997, + "balance_loss_clip": 1.60231149, + "balance_loss_mlp": 1.25981903, + "epoch": 0.015751818673720917, + "flos": 62334680506560.0, + "grad_norm": 1.169417088844552, + "language_loss": 0.71475393, + "learning_rate": 3.5318169562186737e-06, + "loss": 0.75163668, + "num_input_tokens_seen": 2814195, + "router_z_loss_clip": 3.4375, + "router_z_loss_mlp": 4.8125, + "step": 131, + "time_per_iteration": 3.5291693210601807 + }, + { + "auxiliary_loss_clip": 0.01739289, + "auxiliary_loss_mlp": 0.01641807, + "balance_loss_clip": 1.39490843, + "balance_loss_mlp": 1.28456104, + "epoch": 0.015872061564360006, + "flos": 23880831861600.0, + "grad_norm": 2.0304000346560493, + "language_loss": 0.82368612, + "learning_rate": 3.5373260774446292e-06, + "loss": 0.85749704, + "num_input_tokens_seen": 2834645, + "router_z_loss_clip": 3.44140625, + "router_z_loss_mlp": 3.57226562, + "step": 132, + "time_per_iteration": 3.05599308013916 + }, + { + "auxiliary_loss_clip": 0.01725071, + "auxiliary_loss_mlp": 0.01557081, + "balance_loss_clip": 1.38051236, + "balance_loss_mlp": 1.18476653, + "epoch": 0.0159923044549991, + "flos": 23370162014880.0, + "grad_norm": 12.046907017525262, + "language_loss": 0.90558958, + "learning_rate": 3.542793620000961e-06, + "loss": 0.93841112, + "num_input_tokens_seen": 2854120, + "router_z_loss_clip": 3.4453125, + "router_z_loss_mlp": 3.72070312, + "step": 133, + "time_per_iteration": 3.0258593559265137 + }, + { + "auxiliary_loss_clip": 0.01722626, + "auxiliary_loss_mlp": 0.01603489, + "balance_loss_clip": 1.37592614, + "balance_loss_mlp": 1.24586082, + "epoch": 0.01611254734563819, + "flos": 17860245271200.0, + "grad_norm": 4.1735459484840876, + "language_loss": 0.87234533, + "learning_rate": 3.5482202067978894e-06, + "loss": 0.90560645, + "num_input_tokens_seen": 2871330, + "router_z_loss_clip": 3.47070312, + "router_z_loss_mlp": 3.57617188, + "step": 134, + "time_per_iteration": 3.0081253051757812 + }, + { + "auxiliary_loss_clip": 0.01728936, + "auxiliary_loss_mlp": 0.0154466, + "balance_loss_clip": 1.38253987, + "balance_loss_mlp": 1.18512475, + "epoch": 0.01623279023627728, + "flos": 20956617436320.0, + "grad_norm": 4.470349997089716, + "language_loss": 0.76430583, + "learning_rate": 3.553606446851471e-06, + "loss": 0.79704177, + "num_input_tokens_seen": 2888070, + "router_z_loss_clip": 3.4609375, + "router_z_loss_mlp": 3.59375, + "step": 135, + "time_per_iteration": 2.9790596961975098 + }, + { + "auxiliary_loss_clip": 0.01722419, + "auxiliary_loss_mlp": 0.01579363, + "balance_loss_clip": 1.37609625, + "balance_loss_mlp": 1.21429694, + "epoch": 0.016353033126916373, + "flos": 15744250022400.0, + "grad_norm": 2.8592875463965357, + "language_loss": 0.83715272, + "learning_rate": 3.5589529356937613e-06, + "loss": 0.87017059, + "num_input_tokens_seen": 2906465, + "router_z_loss_clip": 3.4609375, + "router_z_loss_mlp": 3.64648438, + "step": 136, + "time_per_iteration": 3.0272176265716553 + }, + { + "auxiliary_loss_clip": 0.01724174, + "auxiliary_loss_mlp": 0.01561869, + "balance_loss_clip": 1.37754714, + "balance_loss_mlp": 1.20252442, + "epoch": 0.016473276017555463, + "flos": 18809293163040.0, + "grad_norm": 2.036717975823697, + "language_loss": 0.77090442, + "learning_rate": 3.5642602557679627e-06, + "loss": 0.80376482, + "num_input_tokens_seen": 2924915, + "router_z_loss_clip": 3.46679688, + "router_z_loss_mlp": 3.59375, + "step": 137, + "time_per_iteration": 3.030698299407959 + }, + { + "auxiliary_loss_clip": 0.01745993, + "auxiliary_loss_mlp": 0.01579751, + "balance_loss_clip": 1.40069258, + "balance_loss_mlp": 1.21411192, + "epoch": 0.016593518908194552, + "flos": 24354938597760.0, + "grad_norm": 2.5910282248431855, + "language_loss": 0.84202659, + "learning_rate": 3.569528976809202e-06, + "loss": 0.87528402, + "num_input_tokens_seen": 2942130, + "router_z_loss_clip": 3.45117188, + "router_z_loss_mlp": 3.65820312, + "step": 138, + "time_per_iteration": 3.034301280975342 + }, + { + "auxiliary_loss_clip": 0.01720325, + "auxiliary_loss_mlp": 0.01559204, + "balance_loss_clip": 1.37426496, + "balance_loss_mlp": 1.20729792, + "epoch": 0.016713761798833646, + "flos": 22348329255360.0, + "grad_norm": 2.707247215263435, + "language_loss": 0.90300596, + "learning_rate": 3.5747596562115522e-06, + "loss": 0.93580127, + "num_input_tokens_seen": 2962745, + "router_z_loss_clip": 3.45898438, + "router_z_loss_mlp": 3.51953125, + "step": 139, + "time_per_iteration": 3.123987913131714 + }, + { + "auxiliary_loss_clip": 0.01726551, + "auxiliary_loss_mlp": 0.01601577, + "balance_loss_clip": 1.38005805, + "balance_loss_mlp": 1.2599709, + "epoch": 0.016834004689472735, + "flos": 17823758016960.0, + "grad_norm": 3.761293205692898, + "language_loss": 0.91258609, + "learning_rate": 3.5799528393819138e-06, + "loss": 0.94586742, + "num_input_tokens_seen": 2981825, + "router_z_loss_clip": 3.46679688, + "router_z_loss_mlp": 3.41796875, + "step": 140, + "time_per_iteration": 3.154268980026245 + }, + { + "auxiliary_loss_clip": 0.01720091, + "auxiliary_loss_mlp": 0.01622025, + "balance_loss_clip": 1.37421429, + "balance_loss_mlp": 1.2712636, + "epoch": 0.016954247580111825, + "flos": 20521463284800.0, + "grad_norm": 2.3637283638452256, + "language_loss": 0.88433111, + "learning_rate": 3.585109060081286e-06, + "loss": 0.91775227, + "num_input_tokens_seen": 3001625, + "router_z_loss_clip": 3.4609375, + "router_z_loss_mlp": 3.50976562, + "step": 141, + "time_per_iteration": 3.1481375694274902 + }, + { + "auxiliary_loss_clip": 0.01726, + "auxiliary_loss_mlp": 0.01552397, + "balance_loss_clip": 1.38003492, + "balance_loss_mlp": 1.20239878, + "epoch": 0.017074490470750915, + "flos": 22090946211360.0, + "grad_norm": 1.8034065246450588, + "language_loss": 0.78722221, + "learning_rate": 3.590228840753992e-06, + "loss": 0.82000613, + "num_input_tokens_seen": 3022055, + "router_z_loss_clip": 3.45898438, + "router_z_loss_mlp": 3.50390625, + "step": 142, + "time_per_iteration": 3.089620590209961 + }, + { + "auxiliary_loss_clip": 0.01719063, + "auxiliary_loss_mlp": 0.01591211, + "balance_loss_clip": 1.3731761, + "balance_loss_mlp": 1.23758829, + "epoch": 0.01719473336139001, + "flos": 15999091879680.0, + "grad_norm": 2.484568580454919, + "language_loss": 0.87459701, + "learning_rate": 3.5953126928453423e-06, + "loss": 0.9076997, + "num_input_tokens_seen": 3039605, + "router_z_loss_clip": 3.45898438, + "router_z_loss_mlp": 3.54101562, + "step": 143, + "time_per_iteration": 3.1183016300201416 + }, + { + "auxiliary_loss_clip": 0.01726196, + "auxiliary_loss_mlp": 0.01582251, + "balance_loss_clip": 1.38020802, + "balance_loss_mlp": 1.22118998, + "epoch": 0.017314976252029098, + "flos": 22494240344160.0, + "grad_norm": 2.7717599893259273, + "language_loss": 0.80444402, + "learning_rate": 3.600361117108239e-06, + "loss": 0.83752853, + "num_input_tokens_seen": 3059405, + "router_z_loss_clip": 3.46289062, + "router_z_loss_mlp": 3.61328125, + "step": 144, + "time_per_iteration": 3.1173739433288574 + }, + { + "auxiliary_loss_clip": 0.01721433, + "auxiliary_loss_mlp": 0.0156488, + "balance_loss_clip": 1.376544, + "balance_loss_mlp": 1.20229292, + "epoch": 0.017435219142668188, + "flos": 22020247392480.0, + "grad_norm": 4.450032804824168, + "language_loss": 0.97485232, + "learning_rate": 3.6053746038991616e-06, + "loss": 1.00771546, + "num_input_tokens_seen": 3078490, + "router_z_loss_clip": 3.44921875, + "router_z_loss_mlp": 3.625, + "step": 145, + "time_per_iteration": 3.09932017326355 + }, + { + "auxiliary_loss_clip": 0.01903401, + "auxiliary_loss_mlp": 0.01602417, + "balance_loss_clip": 1.56062865, + "balance_loss_mlp": 1.24688721, + "epoch": 0.01755546203330728, + "flos": 72246545654400.0, + "grad_norm": 1.242021686883176, + "language_loss": 0.58458471, + "learning_rate": 3.6103536334639843e-06, + "loss": 0.61964291, + "num_input_tokens_seen": 3131755, + "router_z_loss_clip": 3.421875, + "router_z_loss_mlp": 3.5625, + "step": 146, + "time_per_iteration": 3.5243642330169678 + }, + { + "auxiliary_loss_clip": 0.01721136, + "auxiliary_loss_mlp": 0.01556228, + "balance_loss_clip": 1.37619066, + "balance_loss_mlp": 1.20107961, + "epoch": 0.01767570492394637, + "flos": 25339373827200.0, + "grad_norm": 3.273901101499213, + "language_loss": 0.85870171, + "learning_rate": 3.615298676214041e-06, + "loss": 0.89147538, + "num_input_tokens_seen": 3152035, + "router_z_loss_clip": 3.45117188, + "router_z_loss_mlp": 3.55078125, + "step": 147, + "time_per_iteration": 2.9677045345306396 + }, + { + "auxiliary_loss_clip": 0.01712484, + "auxiliary_loss_mlp": 0.01565645, + "balance_loss_clip": 1.36414742, + "balance_loss_mlp": 1.21087837, + "epoch": 0.01779594781458546, + "flos": 20451295460160.0, + "grad_norm": 2.1714739505044522, + "language_loss": 0.88971442, + "learning_rate": 3.6202101929928317e-06, + "loss": 0.92249572, + "num_input_tokens_seen": 3170625, + "router_z_loss_clip": 3.48632812, + "router_z_loss_mlp": 3.55273438, + "step": 148, + "time_per_iteration": 3.091620922088623 + }, + { + "auxiliary_loss_clip": 0.01715455, + "auxiliary_loss_mlp": 0.0157901, + "balance_loss_clip": 1.36884058, + "balance_loss_mlp": 1.22328925, + "epoch": 0.017916190705224554, + "flos": 16255488791520.0, + "grad_norm": 6.576416573310424, + "language_loss": 0.88922846, + "learning_rate": 3.6250886353337413e-06, + "loss": 0.92217314, + "num_input_tokens_seen": 3188155, + "router_z_loss_clip": 3.47070312, + "router_z_loss_mlp": 3.55859375, + "step": 149, + "time_per_iteration": 3.044194459915161 + }, + { + "auxiliary_loss_clip": 0.01720052, + "auxiliary_loss_mlp": 0.01578606, + "balance_loss_clip": 1.37501633, + "balance_loss_mlp": 1.2272718, + "epoch": 0.018036433595863644, + "flos": 23332802412960.0, + "grad_norm": 2.029458641395882, + "language_loss": 0.86735904, + "learning_rate": 3.6299344457091488e-06, + "loss": 0.90034556, + "num_input_tokens_seen": 3209015, + "router_z_loss_clip": 3.45117188, + "router_z_loss_mlp": 3.51367188, + "step": 150, + "time_per_iteration": 3.0669867992401123 + }, + { + "auxiliary_loss_clip": 0.01720731, + "auxiliary_loss_mlp": 0.015761, + "balance_loss_clip": 1.37428021, + "balance_loss_mlp": 1.23411274, + "epoch": 0.018156676486502734, + "flos": 18589990356000.0, + "grad_norm": 3.5589848204199894, + "language_loss": 0.94096732, + "learning_rate": 3.634748057771256e-06, + "loss": 0.9739356, + "num_input_tokens_seen": 3224955, + "router_z_loss_clip": 3.46289062, + "router_z_loss_mlp": 3.421875, + "step": 151, + "time_per_iteration": 2.9954116344451904 + }, + { + "auxiliary_loss_clip": 0.01715191, + "auxiliary_loss_mlp": 0.01556039, + "balance_loss_clip": 1.36890256, + "balance_loss_mlp": 1.20394278, + "epoch": 0.018276919377141827, + "flos": 25451149207680.0, + "grad_norm": 1.7819806269381302, + "language_loss": 0.85996294, + "learning_rate": 3.639529896584965e-06, + "loss": 0.89267522, + "num_input_tokens_seen": 3246330, + "router_z_loss_clip": 3.46289062, + "router_z_loss_mlp": 3.5234375, + "step": 152, + "time_per_iteration": 3.1089417934417725 + }, + { + "auxiliary_loss_clip": 0.01717216, + "auxiliary_loss_mlp": 0.01564354, + "balance_loss_clip": 1.37178075, + "balance_loss_mlp": 1.21988678, + "epoch": 0.018397162267780917, + "flos": 20049366741120.0, + "grad_norm": 3.1368316038093944, + "language_loss": 0.89599103, + "learning_rate": 3.6442803788531233e-06, + "loss": 0.92880672, + "num_input_tokens_seen": 3264290, + "router_z_loss_clip": 3.45507812, + "router_z_loss_mlp": 3.44921875, + "step": 153, + "time_per_iteration": 5.465434312820435 + }, + { + "auxiliary_loss_clip": 0.01712917, + "auxiliary_loss_mlp": 0.01560469, + "balance_loss_clip": 1.36511385, + "balance_loss_mlp": 1.20093369, + "epoch": 0.018517405158420007, + "flos": 27567865091520.0, + "grad_norm": 2.554723027599848, + "language_loss": 0.96276665, + "learning_rate": 3.6489999131344357e-06, + "loss": 0.9955005, + "num_input_tokens_seen": 3287065, + "router_z_loss_clip": 3.47070312, + "router_z_loss_mlp": 3.59375, + "step": 154, + "time_per_iteration": 3.9459035396575928 + }, + { + "auxiliary_loss_clip": 0.01717889, + "auxiliary_loss_mlp": 0.01568117, + "balance_loss_clip": 1.37263107, + "balance_loss_mlp": 1.20514846, + "epoch": 0.0186376480490591, + "flos": 19356070982400.0, + "grad_norm": 2.3083298201496962, + "language_loss": 0.90899366, + "learning_rate": 3.653688900054313e-06, + "loss": 0.94185364, + "num_input_tokens_seen": 3305595, + "router_z_loss_clip": 3.45507812, + "router_z_loss_mlp": 3.62890625, + "step": 155, + "time_per_iteration": 3.110231876373291 + }, + { + "auxiliary_loss_clip": 0.0170577, + "auxiliary_loss_mlp": 0.015683, + "balance_loss_clip": 1.35833919, + "balance_loss_mlp": 1.21162605, + "epoch": 0.01875789093969819, + "flos": 26690274581760.0, + "grad_norm": 3.868507849995315, + "language_loss": 0.7607305, + "learning_rate": 3.6583477325089526e-06, + "loss": 0.79347128, + "num_input_tokens_seen": 3326135, + "router_z_loss_clip": 3.47265625, + "router_z_loss_mlp": 3.56835938, + "step": 156, + "time_per_iteration": 3.0628037452697754 + }, + { + "auxiliary_loss_clip": 0.01718417, + "auxiliary_loss_mlp": 0.01605371, + "balance_loss_clip": 1.37105894, + "balance_loss_mlp": 1.2607137, + "epoch": 0.01887813383033728, + "flos": 24355317879360.0, + "grad_norm": 2.5472256471546597, + "language_loss": 1.0457294, + "learning_rate": 3.6629767958628916e-06, + "loss": 1.07896733, + "num_input_tokens_seen": 3343510, + "router_z_loss_clip": 3.47265625, + "router_z_loss_mlp": 3.45117188, + "step": 157, + "time_per_iteration": 3.19812273979187 + }, + { + "auxiliary_loss_clip": 0.01711996, + "auxiliary_loss_mlp": 0.01570388, + "balance_loss_clip": 1.36391866, + "balance_loss_mlp": 1.21771908, + "epoch": 0.018998376720976373, + "flos": 14649291041760.0, + "grad_norm": 3.3525061470078543, + "language_loss": 0.85628116, + "learning_rate": 3.667576468140291e-06, + "loss": 0.88910496, + "num_input_tokens_seen": 3361325, + "router_z_loss_clip": 3.484375, + "router_z_loss_mlp": 3.52734375, + "step": 158, + "time_per_iteration": 3.093219757080078 + }, + { + "auxiliary_loss_clip": 0.01706494, + "auxiliary_loss_mlp": 0.01557213, + "balance_loss_clip": 1.3572104, + "balance_loss_mlp": 1.19500756, + "epoch": 0.019118619611615463, + "flos": 29307305924640.0, + "grad_norm": 3.5465901490390768, + "language_loss": 0.89099848, + "learning_rate": 3.672147120210184e-06, + "loss": 0.92363548, + "num_input_tokens_seen": 3377925, + "router_z_loss_clip": 3.49414062, + "router_z_loss_mlp": 3.62304688, + "step": 159, + "time_per_iteration": 3.0344412326812744 + }, + { + "auxiliary_loss_clip": 0.01719781, + "auxiliary_loss_mlp": 0.01569376, + "balance_loss_clip": 1.37142873, + "balance_loss_mlp": 1.21460962, + "epoch": 0.019238862502254553, + "flos": 20888839085760.0, + "grad_norm": 2.26114417399053, + "language_loss": 0.86598575, + "learning_rate": 3.6766891159659177e-06, + "loss": 0.89887726, + "num_input_tokens_seen": 3396335, + "router_z_loss_clip": 3.48242188, + "router_z_loss_mlp": 3.54882812, + "step": 160, + "time_per_iteration": 3.0252678394317627 + }, + { + "auxiliary_loss_clip": 0.01716339, + "auxiliary_loss_mlp": 0.0156122, + "balance_loss_clip": 1.36930966, + "balance_loss_mlp": 1.19043159, + "epoch": 0.019359105392893646, + "flos": 21362794109280.0, + "grad_norm": 2.9304078120467283, + "language_loss": 0.88233519, + "learning_rate": 3.6812028124990075e-06, + "loss": 0.91511077, + "num_input_tokens_seen": 3413605, + "router_z_loss_clip": 3.47070312, + "router_z_loss_mlp": 3.70507812, + "step": 161, + "time_per_iteration": 2.9890854358673096 + }, + { + "auxiliary_loss_clip": 0.01714721, + "auxiliary_loss_mlp": 0.01570771, + "balance_loss_clip": 1.36795962, + "balance_loss_mlp": 1.20761204, + "epoch": 0.019479348283532736, + "flos": 16285262761440.0, + "grad_norm": 3.0925047332626194, + "language_loss": 0.81521893, + "learning_rate": 3.6856885602676016e-06, + "loss": 0.84807384, + "num_input_tokens_seen": 3429640, + "router_z_loss_clip": 3.47070312, + "router_z_loss_mlp": 3.62890625, + "step": 162, + "time_per_iteration": 3.0826096534729004 + }, + { + "auxiliary_loss_clip": 0.01710465, + "auxiliary_loss_mlp": 0.01562254, + "balance_loss_clip": 1.36281347, + "balance_loss_mlp": 1.19699669, + "epoch": 0.019599591174171826, + "flos": 22093449469920.0, + "grad_norm": 2.23721415707667, + "language_loss": 0.9477011, + "learning_rate": 3.6901467032597733e-06, + "loss": 0.98042822, + "num_input_tokens_seen": 3448125, + "router_z_loss_clip": 3.47460938, + "router_z_loss_mlp": 3.6484375, + "step": 163, + "time_per_iteration": 3.174715042114258 + }, + { + "auxiliary_loss_clip": 0.0170621, + "auxiliary_loss_mlp": 0.01526027, + "balance_loss_clip": 1.3586328, + "balance_loss_mlp": 1.16019773, + "epoch": 0.01971983406481092, + "flos": 19611709331040.0, + "grad_norm": 2.3536752728871715, + "language_loss": 0.87300324, + "learning_rate": 3.694577579151804e-06, + "loss": 0.90532559, + "num_input_tokens_seen": 3466535, + "router_z_loss_clip": 3.47265625, + "router_z_loss_mlp": 3.65820312, + "step": 164, + "time_per_iteration": 3.0007131099700928 + }, + { + "auxiliary_loss_clip": 0.01715449, + "auxiliary_loss_mlp": 0.01561186, + "balance_loss_clip": 1.36757684, + "balance_loss_mlp": 1.19535685, + "epoch": 0.01984007695545001, + "flos": 19101380837760.0, + "grad_norm": 2.7174556268299797, + "language_loss": 0.73900807, + "learning_rate": 3.6989815194616703e-06, + "loss": 0.77177441, + "num_input_tokens_seen": 3483730, + "router_z_loss_clip": 3.47851562, + "router_z_loss_mlp": 3.65820312, + "step": 165, + "time_per_iteration": 3.0427916049957275 + }, + { + "auxiliary_loss_clip": 0.01710191, + "auxiliary_loss_mlp": 0.01565278, + "balance_loss_clip": 1.36154032, + "balance_loss_mlp": 1.1927731, + "epoch": 0.0199603198460891, + "flos": 20850417495360.0, + "grad_norm": 2.2922352924849885, + "language_loss": 0.79933763, + "learning_rate": 3.703358849697888e-06, + "loss": 0.83209229, + "num_input_tokens_seen": 3503640, + "router_z_loss_clip": 3.48828125, + "router_z_loss_mlp": 3.72265625, + "step": 166, + "time_per_iteration": 3.0308518409729004 + }, + { + "auxiliary_loss_clip": 0.01708025, + "auxiliary_loss_mlp": 0.01535308, + "balance_loss_clip": 1.35914683, + "balance_loss_mlp": 1.15154898, + "epoch": 0.020080562736728192, + "flos": 21872857105440.0, + "grad_norm": 1.9626761935224493, + "language_loss": 0.82886654, + "learning_rate": 3.7077098895038803e-06, + "loss": 0.86129987, + "num_input_tokens_seen": 3523010, + "router_z_loss_clip": 3.49023438, + "router_z_loss_mlp": 3.8359375, + "step": 167, + "time_per_iteration": 3.0823593139648438 + }, + { + "auxiliary_loss_clip": 0.0171128, + "auxiliary_loss_mlp": 0.01518995, + "balance_loss_clip": 1.36052334, + "balance_loss_mlp": 1.15526319, + "epoch": 0.020200805627367282, + "flos": 21690913900320.0, + "grad_norm": 2.4708138819706, + "language_loss": 0.96868563, + "learning_rate": 3.712034952798045e-06, + "loss": 1.00098836, + "num_input_tokens_seen": 3541125, + "router_z_loss_clip": 3.5078125, + "router_z_loss_mlp": 3.63476562, + "step": 168, + "time_per_iteration": 3.1328794956207275 + }, + { + "auxiliary_loss_clip": 0.01705537, + "auxiliary_loss_mlp": 0.01553013, + "balance_loss_clip": 1.35471225, + "balance_loss_mlp": 1.188519, + "epoch": 0.02032104851800637, + "flos": 33545630424960.0, + "grad_norm": 2.2062048743946976, + "language_loss": 0.84725904, + "learning_rate": 3.7163343479096656e-06, + "loss": 0.87984455, + "num_input_tokens_seen": 3562700, + "router_z_loss_clip": 3.50195312, + "router_z_loss_mlp": 3.64453125, + "step": 169, + "time_per_iteration": 3.1022777557373047 + }, + { + "auxiliary_loss_clip": 0.01704198, + "auxiliary_loss_mlp": 0.0152245, + "balance_loss_clip": 1.35542512, + "balance_loss_mlp": 1.1581465, + "epoch": 0.020441291408645465, + "flos": 31689711119520.0, + "grad_norm": 2.301468905005771, + "language_loss": 0.8316747, + "learning_rate": 3.720608377710802e-06, + "loss": 0.86394113, + "num_input_tokens_seen": 3582790, + "router_z_loss_clip": 3.48632812, + "router_z_loss_mlp": 3.640625, + "step": 170, + "time_per_iteration": 3.0955240726470947 + }, + { + "auxiliary_loss_clip": 0.01700517, + "auxiliary_loss_mlp": 0.01559965, + "balance_loss_clip": 1.35064924, + "balance_loss_mlp": 1.19222856, + "epoch": 0.020561534299284555, + "flos": 20888801157600.0, + "grad_norm": 3.7561398760049918, + "language_loss": 0.86685538, + "learning_rate": 3.7248573397443277e-06, + "loss": 0.8994602, + "num_input_tokens_seen": 3601715, + "router_z_loss_clip": 3.49804688, + "router_z_loss_mlp": 3.67578125, + "step": 171, + "time_per_iteration": 3.028334617614746 + }, + { + "auxiliary_loss_clip": 0.0170671, + "auxiliary_loss_mlp": 0.01531599, + "balance_loss_clip": 1.35655463, + "balance_loss_mlp": 1.15718615, + "epoch": 0.020681777189923645, + "flos": 20998755986400.0, + "grad_norm": 2.1122586128059897, + "language_loss": 0.97706497, + "learning_rate": 3.729081526348224e-06, + "loss": 1.00944805, + "num_input_tokens_seen": 3620245, + "router_z_loss_clip": 3.49804688, + "router_z_loss_mlp": 3.7421875, + "step": 172, + "time_per_iteration": 2.971095085144043 + }, + { + "auxiliary_loss_clip": 0.01706092, + "auxiliary_loss_mlp": 0.0151953, + "balance_loss_clip": 1.35571909, + "balance_loss_mlp": 1.15713406, + "epoch": 0.020802020080562738, + "flos": 28260098861760.0, + "grad_norm": 9.237325760996733, + "language_loss": 0.85160649, + "learning_rate": 3.7332812247762777e-06, + "loss": 0.88386273, + "num_input_tokens_seen": 3641545, + "router_z_loss_clip": 3.50390625, + "router_z_loss_mlp": 3.625, + "step": 173, + "time_per_iteration": 3.121901035308838 + }, + { + "auxiliary_loss_clip": 0.01715518, + "auxiliary_loss_mlp": 0.01549885, + "balance_loss_clip": 1.36664104, + "balance_loss_mlp": 1.18195724, + "epoch": 0.020922262971201828, + "flos": 19683242569440.0, + "grad_norm": 3.155971151738191, + "language_loss": 0.95913732, + "learning_rate": 3.737456717315293e-06, + "loss": 0.99179137, + "num_input_tokens_seen": 3660510, + "router_z_loss_clip": 3.48242188, + "router_z_loss_mlp": 3.67578125, + "step": 174, + "time_per_iteration": 3.1513516902923584 + }, + { + "auxiliary_loss_clip": 0.01715339, + "auxiliary_loss_mlp": 0.01600691, + "balance_loss_clip": 1.36619902, + "balance_loss_mlp": 1.2293303, + "epoch": 0.021042505861840918, + "flos": 15668013692160.0, + "grad_norm": 1.9347303427143183, + "language_loss": 0.90789115, + "learning_rate": 3.7416082813989552e-06, + "loss": 0.94105142, + "num_input_tokens_seen": 3677505, + "router_z_loss_clip": 3.49023438, + "router_z_loss_mlp": 3.7109375, + "step": 175, + "time_per_iteration": 3.1367955207824707 + }, + { + "auxiliary_loss_clip": 0.01709997, + "auxiliary_loss_mlp": 0.01574922, + "balance_loss_clip": 1.35848856, + "balance_loss_mlp": 1.20546865, + "epoch": 0.02116274875248001, + "flos": 21144136080960.0, + "grad_norm": 4.51260285688184, + "language_loss": 0.89894712, + "learning_rate": 3.745736189718439e-06, + "loss": 0.93179631, + "num_input_tokens_seen": 3696760, + "router_z_loss_clip": 3.51367188, + "router_z_loss_mlp": 3.6953125, + "step": 176, + "time_per_iteration": 3.118285655975342 + }, + { + "auxiliary_loss_clip": 0.017082, + "auxiliary_loss_mlp": 0.01550659, + "balance_loss_clip": 1.35863805, + "balance_loss_mlp": 1.17929804, + "epoch": 0.0212829916431191, + "flos": 24717990588480.0, + "grad_norm": 4.184445364050825, + "language_loss": 0.72901183, + "learning_rate": 3.749840710329894e-06, + "loss": 0.76160049, + "num_input_tokens_seen": 3717465, + "router_z_loss_clip": 3.49414062, + "router_z_loss_mlp": 3.7109375, + "step": 177, + "time_per_iteration": 3.017211675643921 + }, + { + "auxiliary_loss_clip": 0.01706114, + "auxiliary_loss_mlp": 0.01562084, + "balance_loss_clip": 1.35393667, + "balance_loss_mlp": 1.19358468, + "epoch": 0.02140323453375819, + "flos": 16646873482080.0, + "grad_norm": 3.700812388155272, + "language_loss": 0.98791575, + "learning_rate": 3.7539221067588938e-06, + "loss": 1.0205977, + "num_input_tokens_seen": 3731440, + "router_z_loss_clip": 3.52148438, + "router_z_loss_mlp": 3.68359375, + "step": 178, + "time_per_iteration": 3.1375796794891357 + }, + { + "auxiliary_loss_clip": 0.01702818, + "auxiliary_loss_mlp": 0.01535842, + "balance_loss_clip": 1.3525672, + "balance_loss_mlp": 1.16295528, + "epoch": 0.021523477424397284, + "flos": 20301212273760.0, + "grad_norm": 3.8095936825635826, + "language_loss": 0.94314814, + "learning_rate": 3.757980638101964e-06, + "loss": 0.97553474, + "num_input_tokens_seen": 3744935, + "router_z_loss_clip": 3.49804688, + "router_z_loss_mlp": 3.72851562, + "step": 179, + "time_per_iteration": 3.0374627113342285 + }, + { + "auxiliary_loss_clip": 0.01707491, + "auxiliary_loss_mlp": 0.01542778, + "balance_loss_clip": 1.35667634, + "balance_loss_mlp": 1.17847443, + "epoch": 0.021643720315036374, + "flos": 26106402657600.0, + "grad_norm": 2.361031316531969, + "language_loss": 0.89819503, + "learning_rate": 3.7620165591252806e-06, + "loss": 0.9306978, + "num_input_tokens_seen": 3763035, + "router_z_loss_clip": 3.5078125, + "router_z_loss_mlp": 3.64648438, + "step": 180, + "time_per_iteration": 3.8335883617401123 + }, + { + "auxiliary_loss_clip": 0.01706491, + "auxiliary_loss_mlp": 0.0154619, + "balance_loss_clip": 1.35622406, + "balance_loss_mlp": 1.17292202, + "epoch": 0.021763963205675464, + "flos": 24789827252160.0, + "grad_norm": 2.0350285976562383, + "language_loss": 0.94293296, + "learning_rate": 3.766030120360636e-06, + "loss": 0.97545975, + "num_input_tokens_seen": 3782665, + "router_z_loss_clip": 3.50390625, + "router_z_loss_mlp": 3.73046875, + "step": 181, + "time_per_iteration": 5.694573402404785 + }, + { + "auxiliary_loss_clip": 0.01700224, + "auxiliary_loss_mlp": 0.0157492, + "balance_loss_clip": 1.35096359, + "balance_loss_mlp": 1.21996236, + "epoch": 0.021884206096314557, + "flos": 25816021750080.0, + "grad_norm": 2.637800010580008, + "language_loss": 0.9033286, + "learning_rate": 3.7700215681987578e-06, + "loss": 0.93607998, + "num_input_tokens_seen": 3802435, + "router_z_loss_clip": 3.49414062, + "router_z_loss_mlp": 3.55078125, + "step": 182, + "time_per_iteration": 3.1818203926086426 + }, + { + "auxiliary_loss_clip": 0.01699428, + "auxiliary_loss_mlp": 0.01566391, + "balance_loss_clip": 1.34796178, + "balance_loss_mlp": 1.1969372, + "epoch": 0.022004448986953647, + "flos": 20084716150560.0, + "grad_norm": 2.754747684014809, + "language_loss": 0.82737935, + "learning_rate": 3.7739911449800767e-06, + "loss": 0.86003757, + "num_input_tokens_seen": 3822490, + "router_z_loss_clip": 3.515625, + "router_z_loss_mlp": 3.69140625, + "step": 183, + "time_per_iteration": 3.048610210418701 + }, + { + "auxiliary_loss_clip": 0.01701243, + "auxiliary_loss_mlp": 0.01553503, + "balance_loss_clip": 1.35186136, + "balance_loss_mlp": 1.19625664, + "epoch": 0.022124691877592736, + "flos": 20482434843840.0, + "grad_norm": 2.2591388620855892, + "language_loss": 0.81005317, + "learning_rate": 3.7779390890830114e-06, + "loss": 0.84260058, + "num_input_tokens_seen": 3841140, + "router_z_loss_clip": 3.4921875, + "router_z_loss_mlp": 3.57421875, + "step": 184, + "time_per_iteration": 2.9710264205932617 + }, + { + "auxiliary_loss_clip": 0.01695547, + "auxiliary_loss_mlp": 0.01540517, + "balance_loss_clip": 1.3441484, + "balance_loss_mlp": 1.17106366, + "epoch": 0.02224493476823183, + "flos": 23589123468480.0, + "grad_norm": 2.3027590177353234, + "language_loss": 0.86329585, + "learning_rate": 3.7818656350098723e-06, + "loss": 0.89565647, + "num_input_tokens_seen": 3862090, + "router_z_loss_clip": 3.515625, + "router_z_loss_mlp": 3.69140625, + "step": 185, + "time_per_iteration": 3.0953311920166016 + }, + { + "auxiliary_loss_clip": 0.01697252, + "auxiliary_loss_mlp": 0.01549597, + "balance_loss_clip": 1.34741926, + "balance_loss_mlp": 1.17728293, + "epoch": 0.02236517765887092, + "flos": 16911842158080.0, + "grad_norm": 2.4522822973553238, + "language_loss": 0.77345973, + "learning_rate": 3.7857710134704447e-06, + "loss": 0.80592823, + "num_input_tokens_seen": 3881025, + "router_z_loss_clip": 3.49804688, + "router_z_loss_mlp": 3.71875, + "step": 186, + "time_per_iteration": 3.0076634883880615 + }, + { + "auxiliary_loss_clip": 0.01707777, + "auxiliary_loss_mlp": 0.01537551, + "balance_loss_clip": 1.35851669, + "balance_loss_mlp": 1.17343831, + "epoch": 0.02248542054951001, + "flos": 43511581493280.0, + "grad_norm": 2.5437075697434564, + "language_loss": 0.79534876, + "learning_rate": 3.7896554514633234e-06, + "loss": 0.82780206, + "num_input_tokens_seen": 3905310, + "router_z_loss_clip": 3.49414062, + "router_z_loss_mlp": 3.63867188, + "step": 187, + "time_per_iteration": 3.142277717590332 + }, + { + "auxiliary_loss_clip": 0.01694762, + "auxiliary_loss_mlp": 0.01528278, + "balance_loss_clip": 1.34220076, + "balance_loss_mlp": 1.16092241, + "epoch": 0.022605663440149103, + "flos": 23369896517760.0, + "grad_norm": 2.2981739757663053, + "language_loss": 0.84617352, + "learning_rate": 3.7935191723550955e-06, + "loss": 0.8784039, + "num_input_tokens_seen": 3924265, + "router_z_loss_clip": 3.52734375, + "router_z_loss_mlp": 3.671875, + "step": 188, + "time_per_iteration": 2.9982504844665527 + }, + { + "auxiliary_loss_clip": 0.01698636, + "auxiliary_loss_mlp": 0.01538521, + "balance_loss_clip": 1.34921694, + "balance_loss_mlp": 1.16296422, + "epoch": 0.022725906330788193, + "flos": 29022310815840.0, + "grad_norm": 2.3502383572407934, + "language_loss": 0.88893026, + "learning_rate": 3.797362395957408e-06, + "loss": 0.92130184, + "num_input_tokens_seen": 3944830, + "router_z_loss_clip": 3.49804688, + "router_z_loss_mlp": 3.75390625, + "step": 189, + "time_per_iteration": 3.0127041339874268 + }, + { + "auxiliary_loss_clip": 0.01704774, + "auxiliary_loss_mlp": 0.01564725, + "balance_loss_clip": 1.35587609, + "balance_loss_mlp": 1.20366359, + "epoch": 0.022846149221427282, + "flos": 24498346428000.0, + "grad_norm": 2.9736942080060804, + "language_loss": 0.7861039, + "learning_rate": 3.8011853386020055e-06, + "loss": 0.8187989, + "num_input_tokens_seen": 3965735, + "router_z_loss_clip": 3.49023438, + "router_z_loss_mlp": 3.609375, + "step": 190, + "time_per_iteration": 3.0547502040863037 + }, + { + "auxiliary_loss_clip": 0.0170074, + "auxiliary_loss_mlp": 0.01552688, + "balance_loss_clip": 1.35046053, + "balance_loss_mlp": 1.18209028, + "epoch": 0.022966392112066376, + "flos": 15525364425120.0, + "grad_norm": 3.0276176081388027, + "language_loss": 0.89642096, + "learning_rate": 3.804988213213804e-06, + "loss": 0.9289552, + "num_input_tokens_seen": 3983975, + "router_z_loss_clip": 3.50390625, + "router_z_loss_mlp": 3.703125, + "step": 191, + "time_per_iteration": 2.9620258808135986 + }, + { + "auxiliary_loss_clip": 0.01869299, + "auxiliary_loss_mlp": 0.02493074, + "balance_loss_clip": 1.52428102, + "balance_loss_mlp": 2.233675, + "epoch": 0.023086635002705466, + "flos": 55656109638720.0, + "grad_norm": 1.3633736151182547, + "language_loss": 0.63173449, + "learning_rate": 3.808771229382049e-06, + "loss": 0.67535818, + "num_input_tokens_seen": 4043440, + "router_z_loss_clip": 3.453125, + "router_z_loss_mlp": 2.59375, + "step": 192, + "time_per_iteration": 3.4137423038482666 + }, + { + "auxiliary_loss_clip": 0.01691793, + "auxiliary_loss_mlp": 0.01527566, + "balance_loss_clip": 1.34339833, + "balance_loss_mlp": 1.16116428, + "epoch": 0.023206877893344555, + "flos": 19315221989760.0, + "grad_norm": 2.1760616176117797, + "language_loss": 0.8460812, + "learning_rate": 3.8125345934296324e-06, + "loss": 0.8782748, + "num_input_tokens_seen": 4061750, + "router_z_loss_clip": 3.48242188, + "router_z_loss_mlp": 3.66210938, + "step": 193, + "time_per_iteration": 3.0621259212493896 + }, + { + "auxiliary_loss_clip": 0.0169322, + "auxiliary_loss_mlp": 0.01550672, + "balance_loss_clip": 1.34243202, + "balance_loss_mlp": 1.17950213, + "epoch": 0.02332712078398365, + "flos": 23075040087360.0, + "grad_norm": 2.590065449598657, + "language_loss": 0.88173735, + "learning_rate": 3.81627850848061e-06, + "loss": 0.91417634, + "num_input_tokens_seen": 4082345, + "router_z_loss_clip": 3.50585938, + "router_z_loss_mlp": 3.70898438, + "step": 194, + "time_per_iteration": 3.1244401931762695 + }, + { + "auxiliary_loss_clip": 0.01701773, + "auxiliary_loss_mlp": 0.01549516, + "balance_loss_clip": 1.35235548, + "balance_loss_mlp": 1.17662966, + "epoch": 0.02344736367462274, + "flos": 24428292387840.0, + "grad_norm": 2.27191427054532, + "language_loss": 0.86311781, + "learning_rate": 3.820003174525994e-06, + "loss": 0.89563072, + "num_input_tokens_seen": 4101770, + "router_z_loss_clip": 3.49414062, + "router_z_loss_mlp": 3.72460938, + "step": 195, + "time_per_iteration": 3.0286731719970703 + }, + { + "auxiliary_loss_clip": 0.01693548, + "auxiliary_loss_mlp": 0.01521132, + "balance_loss_clip": 1.34380698, + "balance_loss_mlp": 1.14042544, + "epoch": 0.02356760656526183, + "flos": 21581945203680.0, + "grad_norm": 2.660049507377222, + "language_loss": 0.82804239, + "learning_rate": 3.823708788487851e-06, + "loss": 0.8601892, + "num_input_tokens_seen": 4118770, + "router_z_loss_clip": 3.5, + "router_z_loss_mlp": 3.8046875, + "step": 196, + "time_per_iteration": 3.2478528022766113 + }, + { + "auxiliary_loss_clip": 0.01707254, + "auxiliary_loss_mlp": 0.01552689, + "balance_loss_clip": 1.35863113, + "balance_loss_mlp": 1.16530704, + "epoch": 0.02368784945590092, + "flos": 25196041853280.0, + "grad_norm": 2.1873540718451814, + "language_loss": 0.84640539, + "learning_rate": 3.827395544281781e-06, + "loss": 0.87900484, + "num_input_tokens_seen": 4141110, + "router_z_loss_clip": 3.484375, + "router_z_loss_mlp": 3.87109375, + "step": 197, + "time_per_iteration": 3.0730104446411133 + }, + { + "auxiliary_loss_clip": 0.01697595, + "auxiliary_loss_mlp": 0.01536355, + "balance_loss_clip": 1.34810376, + "balance_loss_mlp": 1.15164328, + "epoch": 0.02380809234654001, + "flos": 27564906695040.0, + "grad_norm": 1.9286117419474778, + "language_loss": 0.79167223, + "learning_rate": 3.831063632877802e-06, + "loss": 0.82401168, + "num_input_tokens_seen": 4161430, + "router_z_loss_clip": 3.49609375, + "router_z_loss_mlp": 3.84375, + "step": 198, + "time_per_iteration": 3.0759410858154297 + }, + { + "auxiliary_loss_clip": 0.01708131, + "auxiliary_loss_mlp": 0.0153249, + "balance_loss_clip": 1.35941577, + "balance_loss_mlp": 1.13194633, + "epoch": 0.0239283352371791, + "flos": 18261756780480.0, + "grad_norm": 2.8929721937673794, + "language_loss": 0.7607671, + "learning_rate": 3.834713242359712e-06, + "loss": 0.79317331, + "num_input_tokens_seen": 4179260, + "router_z_loss_clip": 3.48632812, + "router_z_loss_mlp": 4.00585938, + "step": 199, + "time_per_iteration": 3.0102953910827637 + }, + { + "auxiliary_loss_clip": 0.01691376, + "auxiliary_loss_mlp": 0.0154796, + "balance_loss_clip": 1.34134185, + "balance_loss_mlp": 1.15104115, + "epoch": 0.02404857812781819, + "flos": 21397119458400.0, + "grad_norm": 2.1570431356817377, + "language_loss": 0.87365043, + "learning_rate": 3.838344557982959e-06, + "loss": 0.90604383, + "num_input_tokens_seen": 4200640, + "router_z_loss_clip": 3.50195312, + "router_z_loss_mlp": 3.97070312, + "step": 200, + "time_per_iteration": 3.079772472381592 + }, + { + "auxiliary_loss_clip": 0.01692342, + "auxiliary_loss_mlp": 0.01534503, + "balance_loss_clip": 1.3425045, + "balance_loss_mlp": 1.14464116, + "epoch": 0.024168821018457284, + "flos": 16656127953120.0, + "grad_norm": 2.969751522156824, + "language_loss": 0.84778464, + "learning_rate": 3.841957762231063e-06, + "loss": 0.8800531, + "num_input_tokens_seen": 4218170, + "router_z_loss_clip": 3.5, + "router_z_loss_mlp": 3.89648438, + "step": 201, + "time_per_iteration": 3.084735631942749 + }, + { + "auxiliary_loss_clip": 0.01693076, + "auxiliary_loss_mlp": 0.01516977, + "balance_loss_clip": 1.3435576, + "balance_loss_mlp": 1.1252079, + "epoch": 0.024289063909096374, + "flos": 22823118698400.0, + "grad_norm": 2.1411504165205164, + "language_loss": 0.87986088, + "learning_rate": 3.8455530348706454e-06, + "loss": 0.91196144, + "num_input_tokens_seen": 4237770, + "router_z_loss_clip": 3.49023438, + "router_z_loss_mlp": 3.91601562, + "step": 202, + "time_per_iteration": 2.955965995788574 + }, + { + "auxiliary_loss_clip": 0.01697673, + "auxiliary_loss_mlp": 0.01570717, + "balance_loss_clip": 1.34827137, + "balance_loss_mlp": 1.17208159, + "epoch": 0.024409306799735464, + "flos": 17750518011360.0, + "grad_norm": 2.486716956450227, + "language_loss": 0.77392256, + "learning_rate": 3.849130553005099e-06, + "loss": 0.80660653, + "num_input_tokens_seen": 4255985, + "router_z_loss_clip": 3.48632812, + "router_z_loss_mlp": 3.98828125, + "step": 203, + "time_per_iteration": 3.078613042831421 + }, + { + "auxiliary_loss_clip": 0.0169003, + "auxiliary_loss_mlp": 0.01567603, + "balance_loss_clip": 1.3403405, + "balance_loss_mlp": 1.16953909, + "epoch": 0.024529549690374557, + "flos": 21618356601600.0, + "grad_norm": 3.0763079794583073, + "language_loss": 0.83982182, + "learning_rate": 3.852690491126933e-06, + "loss": 0.87239814, + "num_input_tokens_seen": 4276035, + "router_z_loss_clip": 3.49609375, + "router_z_loss_mlp": 3.98046875, + "step": 204, + "time_per_iteration": 2.982922315597534 + }, + { + "auxiliary_loss_clip": 0.01686988, + "auxiliary_loss_mlp": 0.01520221, + "balance_loss_clip": 1.33592606, + "balance_loss_mlp": 1.12482786, + "epoch": 0.024649792581013647, + "flos": 25553745973440.0, + "grad_norm": 3.715373969674288, + "language_loss": 0.91492409, + "learning_rate": 3.856233021168845e-06, + "loss": 0.94699615, + "num_input_tokens_seen": 4295730, + "router_z_loss_clip": 3.51367188, + "router_z_loss_mlp": 3.95507812, + "step": 205, + "time_per_iteration": 3.047980308532715 + }, + { + "auxiliary_loss_clip": 0.01682972, + "auxiliary_loss_mlp": 0.01509634, + "balance_loss_clip": 1.33280206, + "balance_loss_mlp": 1.12053514, + "epoch": 0.024770035471652737, + "flos": 34498433204640.0, + "grad_norm": 2.423951894295594, + "language_loss": 0.91478151, + "learning_rate": 3.859758312553544e-06, + "loss": 0.94670761, + "num_input_tokens_seen": 4317950, + "router_z_loss_clip": 3.50390625, + "router_z_loss_mlp": 3.88867188, + "step": 206, + "time_per_iteration": 3.8786075115203857 + }, + { + "auxiliary_loss_clip": 0.01691584, + "auxiliary_loss_mlp": 0.01515647, + "balance_loss_clip": 1.34202528, + "balance_loss_mlp": 1.12311506, + "epoch": 0.02489027836229183, + "flos": 21507491496960.0, + "grad_norm": 1.9594322527631933, + "language_loss": 0.9186635, + "learning_rate": 3.8632665322423735e-06, + "loss": 0.95073581, + "num_input_tokens_seen": 4337605, + "router_z_loss_clip": 3.49804688, + "router_z_loss_mlp": 3.92382812, + "step": 207, + "time_per_iteration": 3.095228433609009 + }, + { + "auxiliary_loss_clip": 0.01688777, + "auxiliary_loss_mlp": 0.01525297, + "balance_loss_clip": 1.340433, + "balance_loss_mlp": 1.14363635, + "epoch": 0.02501052125293092, + "flos": 23221102888800.0, + "grad_norm": 2.593893152166965, + "language_loss": 0.85961926, + "learning_rate": 3.866757844782762e-06, + "loss": 0.89175999, + "num_input_tokens_seen": 4358110, + "router_z_loss_clip": 3.48046875, + "router_z_loss_mlp": 3.8125, + "step": 208, + "time_per_iteration": 4.821847915649414 + }, + { + "auxiliary_loss_clip": 0.01694429, + "auxiliary_loss_mlp": 0.01517298, + "balance_loss_clip": 1.34427726, + "balance_loss_mlp": 1.13830829, + "epoch": 0.02513076414357001, + "flos": 26390942628480.0, + "grad_norm": 2.227869095116665, + "language_loss": 0.91683662, + "learning_rate": 3.870232412354527e-06, + "loss": 0.94895387, + "num_input_tokens_seen": 4374955, + "router_z_loss_clip": 3.50585938, + "router_z_loss_mlp": 3.78710938, + "step": 209, + "time_per_iteration": 2.970158100128174 + }, + { + "auxiliary_loss_clip": 0.01681585, + "auxiliary_loss_mlp": 0.01521425, + "balance_loss_clip": 1.3310107, + "balance_loss_mlp": 1.14376998, + "epoch": 0.025251007034209103, + "flos": 13592184729120.0, + "grad_norm": 7.457228922138619, + "language_loss": 0.93009162, + "learning_rate": 3.873690394815086e-06, + "loss": 0.96212173, + "num_input_tokens_seen": 4391535, + "router_z_loss_clip": 3.5078125, + "router_z_loss_mlp": 3.7734375, + "step": 210, + "time_per_iteration": 3.0542445182800293 + }, + { + "auxiliary_loss_clip": 0.01695431, + "auxiliary_loss_mlp": 0.0154091, + "balance_loss_clip": 1.34556532, + "balance_loss_mlp": 1.16211057, + "epoch": 0.025371249924848193, + "flos": 15051030120000.0, + "grad_norm": 2.447216527073415, + "language_loss": 0.91607851, + "learning_rate": 3.877131949743587e-06, + "loss": 0.94844192, + "num_input_tokens_seen": 4408400, + "router_z_loss_clip": 3.49804688, + "router_z_loss_mlp": 3.78320312, + "step": 211, + "time_per_iteration": 2.9664299488067627 + }, + { + "auxiliary_loss_clip": 0.01688976, + "auxiliary_loss_mlp": 0.01541002, + "balance_loss_clip": 1.3386848, + "balance_loss_mlp": 1.16563582, + "epoch": 0.025491492815487283, + "flos": 25556059591200.0, + "grad_norm": 2.450387523786065, + "language_loss": 0.78178322, + "learning_rate": 3.880557232483993e-06, + "loss": 0.81408298, + "num_input_tokens_seen": 4427840, + "router_z_loss_clip": 3.50195312, + "router_z_loss_mlp": 3.75, + "step": 212, + "time_per_iteration": 3.037104606628418 + }, + { + "auxiliary_loss_clip": 0.01679912, + "auxiliary_loss_mlp": 0.01525358, + "balance_loss_clip": 1.32866168, + "balance_loss_mlp": 1.16048205, + "epoch": 0.025611735706126376, + "flos": 20632821455520.0, + "grad_norm": 2.214748717611136, + "language_loss": 0.87144697, + "learning_rate": 3.883966396187164e-06, + "loss": 0.90349966, + "num_input_tokens_seen": 4447110, + "router_z_loss_clip": 3.51367188, + "router_z_loss_mlp": 3.6484375, + "step": 213, + "time_per_iteration": 3.022231340408325 + }, + { + "auxiliary_loss_clip": 0.0169369, + "auxiliary_loss_mlp": 0.01545186, + "balance_loss_clip": 1.34335935, + "balance_loss_mlp": 1.17096448, + "epoch": 0.025731978596765466, + "flos": 19064286732960.0, + "grad_norm": 2.542701788928331, + "language_loss": 0.90219486, + "learning_rate": 3.887359591851937e-06, + "loss": 0.93458366, + "num_input_tokens_seen": 4464715, + "router_z_loss_clip": 3.50390625, + "router_z_loss_mlp": 3.73828125, + "step": 214, + "time_per_iteration": 2.997699737548828 + }, + { + "auxiliary_loss_clip": 0.01688159, + "auxiliary_loss_mlp": 0.0153183, + "balance_loss_clip": 1.33996415, + "balance_loss_mlp": 1.16847968, + "epoch": 0.025852221487404556, + "flos": 22166006768640.0, + "grad_norm": 2.079550480802818, + "language_loss": 0.92320025, + "learning_rate": 3.890736968365265e-06, + "loss": 0.95540011, + "num_input_tokens_seen": 4485030, + "router_z_loss_clip": 3.484375, + "router_z_loss_mlp": 3.63476562, + "step": 215, + "time_per_iteration": 3.067925453186035 + }, + { + "auxiliary_loss_clip": 0.0168297, + "auxiliary_loss_mlp": 0.01538102, + "balance_loss_clip": 1.33245909, + "balance_loss_mlp": 1.17417943, + "epoch": 0.02597246437804365, + "flos": 26544401421120.0, + "grad_norm": 1.8899464437142683, + "language_loss": 0.85333967, + "learning_rate": 3.894098672541412e-06, + "loss": 0.88555038, + "num_input_tokens_seen": 4505935, + "router_z_loss_clip": 3.50390625, + "router_z_loss_mlp": 3.63867188, + "step": 216, + "time_per_iteration": 3.0529844760894775 + }, + { + "auxiliary_loss_clip": 0.01686891, + "auxiliary_loss_mlp": 0.01538835, + "balance_loss_clip": 1.33677793, + "balance_loss_mlp": 1.16575742, + "epoch": 0.02609270726868274, + "flos": 32674449774240.0, + "grad_norm": 4.294136194587479, + "language_loss": 0.75405443, + "learning_rate": 3.89744484916025e-06, + "loss": 0.78631169, + "num_input_tokens_seen": 4527045, + "router_z_loss_clip": 3.49804688, + "router_z_loss_mlp": 3.72851562, + "step": 217, + "time_per_iteration": 3.0982677936553955 + }, + { + "auxiliary_loss_clip": 0.01692143, + "auxiliary_loss_mlp": 0.01527891, + "balance_loss_clip": 1.34263861, + "balance_loss_mlp": 1.16358757, + "epoch": 0.02621295015932183, + "flos": 26245600462080.0, + "grad_norm": 2.4110677065097437, + "language_loss": 0.87609732, + "learning_rate": 3.900775641004673e-06, + "loss": 0.90829772, + "num_input_tokens_seen": 4546360, + "router_z_loss_clip": 3.49609375, + "router_z_loss_mlp": 3.640625, + "step": 218, + "time_per_iteration": 3.090003252029419 + }, + { + "auxiliary_loss_clip": 0.01693036, + "auxiliary_loss_mlp": 0.0158084, + "balance_loss_clip": 1.34041643, + "balance_loss_mlp": 1.22664511, + "epoch": 0.026333193049960922, + "flos": 42924599460000.0, + "grad_norm": 3.014721518749449, + "language_loss": 0.74609214, + "learning_rate": 3.904091188897156e-06, + "loss": 0.77883089, + "num_input_tokens_seen": 4565495, + "router_z_loss_clip": 3.52539062, + "router_z_loss_mlp": 3.54492188, + "step": 219, + "time_per_iteration": 3.171982765197754 + }, + { + "auxiliary_loss_clip": 0.01680009, + "auxiliary_loss_mlp": 0.01602572, + "balance_loss_clip": 1.32904232, + "balance_loss_mlp": 1.24494433, + "epoch": 0.026453435940600012, + "flos": 17965572864480.0, + "grad_norm": 2.445019531388293, + "language_loss": 0.81894994, + "learning_rate": 3.90739163173548e-06, + "loss": 0.85177577, + "num_input_tokens_seen": 4583330, + "router_z_loss_clip": 3.5078125, + "router_z_loss_mlp": 3.57421875, + "step": 220, + "time_per_iteration": 2.9819655418395996 + }, + { + "auxiliary_loss_clip": 0.01677726, + "auxiliary_loss_mlp": 0.01557503, + "balance_loss_clip": 1.32667816, + "balance_loss_mlp": 1.19758582, + "epoch": 0.026573678831239102, + "flos": 18986040210240.0, + "grad_norm": 3.615673591874599, + "language_loss": 0.88352334, + "learning_rate": 3.910677106527646e-06, + "loss": 0.91587567, + "num_input_tokens_seen": 4600520, + "router_z_loss_clip": 3.50585938, + "router_z_loss_mlp": 3.59960938, + "step": 221, + "time_per_iteration": 2.908266067504883 + }, + { + "auxiliary_loss_clip": 0.01684463, + "auxiliary_loss_mlp": 0.01536458, + "balance_loss_clip": 1.33327436, + "balance_loss_mlp": 1.16986597, + "epoch": 0.026693921721878195, + "flos": 29244003096960.0, + "grad_norm": 2.5199240499391733, + "language_loss": 0.84512019, + "learning_rate": 3.913947748426004e-06, + "loss": 0.87732941, + "num_input_tokens_seen": 4617340, + "router_z_loss_clip": 3.50976562, + "router_z_loss_mlp": 3.6640625, + "step": 222, + "time_per_iteration": 2.989180326461792 + }, + { + "auxiliary_loss_clip": 0.01687508, + "auxiliary_loss_mlp": 0.01524127, + "balance_loss_clip": 1.3374331, + "balance_loss_mlp": 1.15486455, + "epoch": 0.026814164612517285, + "flos": 14129328795840.0, + "grad_norm": 3.209195994597729, + "language_loss": 0.77075964, + "learning_rate": 3.9172036907606136e-06, + "loss": 0.802876, + "num_input_tokens_seen": 4630820, + "router_z_loss_clip": 3.50195312, + "router_z_loss_mlp": 3.69140625, + "step": 223, + "time_per_iteration": 3.0143072605133057 + }, + { + "auxiliary_loss_clip": 0.01677855, + "auxiliary_loss_mlp": 0.01552197, + "balance_loss_clip": 1.32562959, + "balance_loss_mlp": 1.19514108, + "epoch": 0.026934407503156375, + "flos": 23514138767520.0, + "grad_norm": 1.9313178071900674, + "language_loss": 0.952461, + "learning_rate": 3.920445065071855e-06, + "loss": 0.9847616, + "num_input_tokens_seen": 4651985, + "router_z_loss_clip": 3.5234375, + "router_z_loss_mlp": 3.56835938, + "step": 224, + "time_per_iteration": 2.982846975326538 + }, + { + "auxiliary_loss_clip": 0.01676493, + "auxiliary_loss_mlp": 0.01561642, + "balance_loss_clip": 1.32605517, + "balance_loss_mlp": 1.20344162, + "epoch": 0.027054650393795468, + "flos": 28952939482560.0, + "grad_norm": 2.883871780133008, + "language_loss": 0.80218923, + "learning_rate": 3.923672001142322e-06, + "loss": 0.83457065, + "num_input_tokens_seen": 4672295, + "router_z_loss_clip": 3.50390625, + "router_z_loss_mlp": 3.58007812, + "step": 225, + "time_per_iteration": 3.0293500423431396 + }, + { + "auxiliary_loss_clip": 0.01678096, + "auxiliary_loss_mlp": 0.01541157, + "balance_loss_clip": 1.32544136, + "balance_loss_mlp": 1.16827035, + "epoch": 0.027174893284434558, + "flos": 31434300339840.0, + "grad_norm": 2.2965300480356765, + "language_loss": 0.84582669, + "learning_rate": 3.926884627027996e-06, + "loss": 0.87801921, + "num_input_tokens_seen": 4696065, + "router_z_loss_clip": 3.52148438, + "router_z_loss_mlp": 3.72460938, + "step": 226, + "time_per_iteration": 3.1869709491729736 + }, + { + "auxiliary_loss_clip": 0.01682815, + "auxiliary_loss_mlp": 0.01511206, + "balance_loss_clip": 1.33271885, + "balance_loss_mlp": 1.14251506, + "epoch": 0.027295136175073648, + "flos": 22056658790400.0, + "grad_norm": 2.1260458291401214, + "language_loss": 0.77656335, + "learning_rate": 3.930083069088744e-06, + "loss": 0.80850357, + "num_input_tokens_seen": 4716065, + "router_z_loss_clip": 3.50390625, + "router_z_loss_mlp": 3.68554688, + "step": 227, + "time_per_iteration": 2.927825927734375 + }, + { + "auxiliary_loss_clip": 0.0181386, + "auxiliary_loss_mlp": 0.01517265, + "balance_loss_clip": 1.4600184, + "balance_loss_mlp": 1.09764862, + "epoch": 0.02741537906571274, + "flos": 60807070632960.0, + "grad_norm": 1.0160067673801332, + "language_loss": 0.59338987, + "learning_rate": 3.933267452018137e-06, + "loss": 0.62670112, + "num_input_tokens_seen": 4775860, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 4.203125, + "step": 228, + "time_per_iteration": 3.4845077991485596 + }, + { + "auxiliary_loss_clip": 0.01684363, + "auxiliary_loss_mlp": 0.01558732, + "balance_loss_clip": 1.33395493, + "balance_loss_mlp": 1.1944288, + "epoch": 0.02753562195635183, + "flos": 24608490897600.0, + "grad_norm": 2.665370662083213, + "language_loss": 0.844432, + "learning_rate": 3.936437898872622e-06, + "loss": 0.87686294, + "num_input_tokens_seen": 4795835, + "router_z_loss_clip": 3.5, + "router_z_loss_mlp": 3.64453125, + "step": 229, + "time_per_iteration": 3.0022730827331543 + }, + { + "auxiliary_loss_clip": 0.01673891, + "auxiliary_loss_mlp": 0.0155216, + "balance_loss_clip": 1.32119632, + "balance_loss_mlp": 1.18480468, + "epoch": 0.02765586484699092, + "flos": 34097073408000.0, + "grad_norm": 3.3933314025525334, + "language_loss": 0.80024517, + "learning_rate": 3.9395945311000525e-06, + "loss": 0.83250576, + "num_input_tokens_seen": 4817460, + "router_z_loss_clip": 3.52734375, + "router_z_loss_mlp": 3.671875, + "step": 230, + "time_per_iteration": 3.2758753299713135 + }, + { + "auxiliary_loss_clip": 0.01682951, + "auxiliary_loss_mlp": 0.01532191, + "balance_loss_clip": 1.33346331, + "balance_loss_mlp": 1.16788733, + "epoch": 0.027776107737630014, + "flos": 14831689384800.0, + "grad_norm": 2.3866668704086123, + "language_loss": 0.91018784, + "learning_rate": 3.942737468567608e-06, + "loss": 0.9423393, + "num_input_tokens_seen": 4835475, + "router_z_loss_clip": 3.49414062, + "router_z_loss_mlp": 3.640625, + "step": 231, + "time_per_iteration": 3.1530206203460693 + }, + { + "auxiliary_loss_clip": 0.01673185, + "auxiliary_loss_mlp": 0.01539515, + "balance_loss_clip": 1.3207314, + "balance_loss_mlp": 1.17730904, + "epoch": 0.027896350628269104, + "flos": 47923846356960.0, + "grad_norm": 2.113894020186514, + "language_loss": 0.86240286, + "learning_rate": 3.9458668295891026e-06, + "loss": 0.89452988, + "num_input_tokens_seen": 4857760, + "router_z_loss_clip": 3.5234375, + "router_z_loss_mlp": 3.62304688, + "step": 232, + "time_per_iteration": 3.248979330062866 + }, + { + "auxiliary_loss_clip": 0.01667967, + "auxiliary_loss_mlp": 0.01516494, + "balance_loss_clip": 1.31712472, + "balance_loss_mlp": 1.14627767, + "epoch": 0.028016593518908194, + "flos": 21686438377440.0, + "grad_norm": 3.052231470982136, + "language_loss": 0.86866552, + "learning_rate": 3.948982730951712e-06, + "loss": 0.90051013, + "num_input_tokens_seen": 4875855, + "router_z_loss_clip": 3.50585938, + "router_z_loss_mlp": 3.69921875, + "step": 233, + "time_per_iteration": 3.044569730758667 + }, + { + "auxiliary_loss_clip": 0.01682922, + "auxiliary_loss_mlp": 0.01528158, + "balance_loss_clip": 1.33158839, + "balance_loss_mlp": 1.16576171, + "epoch": 0.028136836409547287, + "flos": 18441310511520.0, + "grad_norm": 3.14841736763461, + "language_loss": 0.82001358, + "learning_rate": 3.9520852879421254e-06, + "loss": 0.85212439, + "num_input_tokens_seen": 4893200, + "router_z_loss_clip": 3.51171875, + "router_z_loss_mlp": 3.625, + "step": 234, + "time_per_iteration": 3.842411994934082 + }, + { + "auxiliary_loss_clip": 0.01676097, + "auxiliary_loss_mlp": 0.01544489, + "balance_loss_clip": 1.32482195, + "balance_loss_mlp": 1.19258332, + "epoch": 0.028257079300186377, + "flos": 31579642506240.0, + "grad_norm": 3.267301047088949, + "language_loss": 0.81825042, + "learning_rate": 3.955174614372137e-06, + "loss": 0.85045624, + "num_input_tokens_seen": 4912965, + "router_z_loss_clip": 3.5078125, + "router_z_loss_mlp": 3.51953125, + "step": 235, + "time_per_iteration": 4.071439743041992 + }, + { + "auxiliary_loss_clip": 0.01678545, + "auxiliary_loss_mlp": 0.01561871, + "balance_loss_clip": 1.32753277, + "balance_loss_mlp": 1.19337106, + "epoch": 0.028377322190825467, + "flos": 23515580037600.0, + "grad_norm": 3.841172940629021, + "language_loss": 0.8464936, + "learning_rate": 3.9582508226037045e-06, + "loss": 0.87889779, + "num_input_tokens_seen": 4933105, + "router_z_loss_clip": 3.51171875, + "router_z_loss_mlp": 3.68164062, + "step": 236, + "time_per_iteration": 4.7261269092559814 + }, + { + "auxiliary_loss_clip": 0.01676679, + "auxiliary_loss_mlp": 0.01568899, + "balance_loss_clip": 1.32646906, + "balance_loss_mlp": 1.21489525, + "epoch": 0.02849756508146456, + "flos": 20481372855360.0, + "grad_norm": 2.510498866568867, + "language_loss": 0.94078457, + "learning_rate": 3.9613140235734636e-06, + "loss": 0.97324038, + "num_input_tokens_seen": 4950085, + "router_z_loss_clip": 3.5, + "router_z_loss_mlp": 3.5390625, + "step": 237, + "time_per_iteration": 3.0335419178009033 + }, + { + "auxiliary_loss_clip": 0.01678455, + "auxiliary_loss_mlp": 0.01548067, + "balance_loss_clip": 1.32771134, + "balance_loss_mlp": 1.18662453, + "epoch": 0.02861780797210365, + "flos": 14285139134400.0, + "grad_norm": 2.1223261790812558, + "language_loss": 0.81165099, + "learning_rate": 3.96436432681674e-06, + "loss": 0.84391624, + "num_input_tokens_seen": 4968075, + "router_z_loss_clip": 3.50976562, + "router_z_loss_mlp": 3.61523438, + "step": 238, + "time_per_iteration": 3.0165860652923584 + }, + { + "auxiliary_loss_clip": 0.01673142, + "auxiliary_loss_mlp": 0.01557284, + "balance_loss_clip": 1.32248878, + "balance_loss_mlp": 1.19584179, + "epoch": 0.02873805086274274, + "flos": 25810711807680.0, + "grad_norm": 7.256479256972048, + "language_loss": 0.89267606, + "learning_rate": 3.967401840491044e-06, + "loss": 0.92498028, + "num_input_tokens_seen": 4987355, + "router_z_loss_clip": 3.50976562, + "router_z_loss_mlp": 3.61132812, + "step": 239, + "time_per_iteration": 2.943526029586792 + }, + { + "auxiliary_loss_clip": 0.01675211, + "auxiliary_loss_mlp": 0.01558277, + "balance_loss_clip": 1.3240217, + "balance_loss_mlp": 1.19378257, + "epoch": 0.028858293753381833, + "flos": 17305768035360.0, + "grad_norm": 3.392640733186826, + "language_loss": 0.87608182, + "learning_rate": 3.97042667139909e-06, + "loss": 0.90841663, + "num_input_tokens_seen": 5004680, + "router_z_loss_clip": 3.5078125, + "router_z_loss_mlp": 3.64257812, + "step": 240, + "time_per_iteration": 2.994819402694702 + }, + { + "auxiliary_loss_clip": 0.01676858, + "auxiliary_loss_mlp": 0.01512477, + "balance_loss_clip": 1.32513213, + "balance_loss_mlp": 1.14130664, + "epoch": 0.028978536644020923, + "flos": 23040525097440.0, + "grad_norm": 2.496220583026839, + "language_loss": 0.87818831, + "learning_rate": 3.973438925011327e-06, + "loss": 0.91008162, + "num_input_tokens_seen": 5022965, + "router_z_loss_clip": 3.515625, + "router_z_loss_mlp": 3.7109375, + "step": 241, + "time_per_iteration": 2.9795727729797363 + }, + { + "auxiliary_loss_clip": 0.01665463, + "auxiliary_loss_mlp": 0.01511682, + "balance_loss_clip": 1.31431127, + "balance_loss_mlp": 1.14108443, + "epoch": 0.029098779534660012, + "flos": 28332238950720.0, + "grad_norm": 2.447822856505699, + "language_loss": 0.91616994, + "learning_rate": 3.976438705488002e-06, + "loss": 0.9479413, + "num_input_tokens_seen": 5042625, + "router_z_loss_clip": 3.51171875, + "router_z_loss_mlp": 3.703125, + "step": 242, + "time_per_iteration": 3.0530943870544434 + }, + { + "auxiliary_loss_clip": 0.0169012, + "auxiliary_loss_mlp": 0.01532955, + "balance_loss_clip": 1.33884454, + "balance_loss_mlp": 1.16464579, + "epoch": 0.029219022425299106, + "flos": 13883400056160.0, + "grad_norm": 2.7156329453152726, + "language_loss": 0.9319877, + "learning_rate": 3.9794261157007744e-06, + "loss": 0.96421838, + "num_input_tokens_seen": 5060380, + "router_z_loss_clip": 3.51171875, + "router_z_loss_mlp": 3.6796875, + "step": 243, + "time_per_iteration": 2.9623630046844482 + }, + { + "auxiliary_loss_clip": 0.01689319, + "auxiliary_loss_mlp": 0.01540399, + "balance_loss_clip": 1.33825767, + "balance_loss_mlp": 1.16732156, + "epoch": 0.029339265315938196, + "flos": 19424266542720.0, + "grad_norm": 4.30714396896619, + "language_loss": 0.85086751, + "learning_rate": 3.982401257253887e-06, + "loss": 0.88316464, + "num_input_tokens_seen": 5078720, + "router_z_loss_clip": 3.50976562, + "router_z_loss_mlp": 3.72851562, + "step": 244, + "time_per_iteration": 3.020616292953491 + }, + { + "auxiliary_loss_clip": 0.01676137, + "auxiliary_loss_mlp": 0.01518588, + "balance_loss_clip": 1.32463598, + "balance_loss_mlp": 1.14303136, + "epoch": 0.029459508206577285, + "flos": 15671389298400.0, + "grad_norm": 3.3711579130007836, + "language_loss": 0.89877254, + "learning_rate": 3.985364230504893e-06, + "loss": 0.93071985, + "num_input_tokens_seen": 5096605, + "router_z_loss_clip": 3.51367188, + "router_z_loss_mlp": 3.75195312, + "step": 245, + "time_per_iteration": 3.1095879077911377 + }, + { + "auxiliary_loss_clip": 0.016863, + "auxiliary_loss_mlp": 0.015359, + "balance_loss_clip": 1.33432388, + "balance_loss_mlp": 1.17045152, + "epoch": 0.02957975109721638, + "flos": 28223535751200.0, + "grad_norm": 2.877415984771378, + "language_loss": 0.84443557, + "learning_rate": 3.988315134584976e-06, + "loss": 0.87665761, + "num_input_tokens_seen": 5116285, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 3.65234375, + "step": 246, + "time_per_iteration": 3.021153688430786 + }, + { + "auxiliary_loss_clip": 0.01673676, + "auxiliary_loss_mlp": 0.01523838, + "balance_loss_clip": 1.32175303, + "balance_loss_mlp": 1.15209615, + "epoch": 0.02969999398785547, + "flos": 24318147918240.0, + "grad_norm": 3.605682989909795, + "language_loss": 0.80500591, + "learning_rate": 3.991254067418851e-06, + "loss": 0.83698106, + "num_input_tokens_seen": 5136825, + "router_z_loss_clip": 3.52148438, + "router_z_loss_mlp": 3.71484375, + "step": 247, + "time_per_iteration": 2.9672114849090576 + }, + { + "auxiliary_loss_clip": 0.01677029, + "auxiliary_loss_mlp": 0.01528907, + "balance_loss_clip": 1.32576084, + "balance_loss_mlp": 1.16097903, + "epoch": 0.02982023687849456, + "flos": 35081432781120.0, + "grad_norm": 2.599653837754337, + "language_loss": 0.83042812, + "learning_rate": 3.994181125744254e-06, + "loss": 0.86248755, + "num_input_tokens_seen": 5158630, + "router_z_loss_clip": 3.51171875, + "router_z_loss_mlp": 3.67578125, + "step": 248, + "time_per_iteration": 3.0947835445404053 + }, + { + "auxiliary_loss_clip": 0.01674932, + "auxiliary_loss_mlp": 0.01532463, + "balance_loss_clip": 1.32275248, + "balance_loss_mlp": 1.15805018, + "epoch": 0.02994047976913365, + "flos": 26179566806880.0, + "grad_norm": 2.335634282426652, + "language_loss": 0.74317288, + "learning_rate": 3.99709640513106e-06, + "loss": 0.77524686, + "num_input_tokens_seen": 5179510, + "router_z_loss_clip": 3.51757812, + "router_z_loss_mlp": 3.73828125, + "step": 249, + "time_per_iteration": 2.9580206871032715 + }, + { + "auxiliary_loss_clip": 0.0167818, + "auxiliary_loss_mlp": 0.01517508, + "balance_loss_clip": 1.32594228, + "balance_loss_mlp": 1.1524415, + "epoch": 0.03006072265977274, + "flos": 25627137691680.0, + "grad_norm": 2.8952043662832296, + "language_loss": 0.85528761, + "learning_rate": 4e-06, + "loss": 0.88724446, + "num_input_tokens_seen": 5199345, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 3.6484375, + "step": 250, + "time_per_iteration": 3.0514156818389893 + }, + { + "auxiliary_loss_clip": 0.01685344, + "auxiliary_loss_mlp": 0.01561039, + "balance_loss_clip": 1.33468831, + "balance_loss_mlp": 1.1954, + "epoch": 0.03018096555041183, + "flos": 22129254017280.0, + "grad_norm": 3.376571809373255, + "language_loss": 0.8883003, + "learning_rate": 3.999999848300794e-06, + "loss": 0.92076403, + "num_input_tokens_seen": 5218330, + "router_z_loss_clip": 3.5078125, + "router_z_loss_mlp": 3.65429688, + "step": 251, + "time_per_iteration": 2.981887102127075 + }, + { + "auxiliary_loss_clip": 0.0167456, + "auxiliary_loss_mlp": 0.01536475, + "balance_loss_clip": 1.32320762, + "balance_loss_mlp": 1.17178988, + "epoch": 0.030301208441050925, + "flos": 30190699442880.0, + "grad_norm": 1.7704204276709004, + "language_loss": 0.89438188, + "learning_rate": 3.999999393203203e-06, + "loss": 0.92649221, + "num_input_tokens_seen": 5240740, + "router_z_loss_clip": 3.51171875, + "router_z_loss_mlp": 3.64648438, + "step": 252, + "time_per_iteration": 3.009105682373047 + }, + { + "auxiliary_loss_clip": 0.01669671, + "auxiliary_loss_mlp": 0.01532617, + "balance_loss_clip": 1.31691372, + "balance_loss_mlp": 1.16812253, + "epoch": 0.030421451331690014, + "flos": 23623562602080.0, + "grad_norm": 2.40615170527551, + "language_loss": 0.85421664, + "learning_rate": 3.999998634707293e-06, + "loss": 0.88623953, + "num_input_tokens_seen": 5260290, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 3.64648438, + "step": 253, + "time_per_iteration": 3.0588927268981934 + }, + { + "auxiliary_loss_clip": 0.01688891, + "auxiliary_loss_mlp": 0.01546226, + "balance_loss_clip": 1.33655095, + "balance_loss_mlp": 1.18020558, + "epoch": 0.030541694222329104, + "flos": 27930955010400.0, + "grad_norm": 2.4486034593754282, + "language_loss": 0.96699905, + "learning_rate": 3.999997572813182e-06, + "loss": 0.99935019, + "num_input_tokens_seen": 5278100, + "router_z_loss_clip": 3.52539062, + "router_z_loss_mlp": 3.66015625, + "step": 254, + "time_per_iteration": 3.0166637897491455 + }, + { + "auxiliary_loss_clip": 0.01668393, + "auxiliary_loss_mlp": 0.01539497, + "balance_loss_clip": 1.3159399, + "balance_loss_mlp": 1.16641927, + "epoch": 0.030661937112968194, + "flos": 18590445493920.0, + "grad_norm": 2.759883871702406, + "language_loss": 0.87534702, + "learning_rate": 3.999996207521028e-06, + "loss": 0.90742594, + "num_input_tokens_seen": 5296810, + "router_z_loss_clip": 3.52734375, + "router_z_loss_mlp": 3.73046875, + "step": 255, + "time_per_iteration": 3.113816976547241 + }, + { + "auxiliary_loss_clip": 0.01674955, + "auxiliary_loss_mlp": 0.01538262, + "balance_loss_clip": 1.32208884, + "balance_loss_mlp": 1.17243242, + "epoch": 0.030782180003607287, + "flos": 12970801490400.0, + "grad_norm": 2.5500887043087586, + "language_loss": 0.8182807, + "learning_rate": 3.999994538831039e-06, + "loss": 0.85041285, + "num_input_tokens_seen": 5313395, + "router_z_loss_clip": 3.52539062, + "router_z_loss_mlp": 3.65820312, + "step": 256, + "time_per_iteration": 3.1043739318847656 + }, + { + "auxiliary_loss_clip": 0.01670036, + "auxiliary_loss_mlp": 0.01520257, + "balance_loss_clip": 1.31654429, + "balance_loss_mlp": 1.15175688, + "epoch": 0.030902422894246377, + "flos": 23337846858240.0, + "grad_norm": 3.5684080944810717, + "language_loss": 0.85869634, + "learning_rate": 3.99999256674347e-06, + "loss": 0.89059925, + "num_input_tokens_seen": 5333545, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 3.68164062, + "step": 257, + "time_per_iteration": 3.01969575881958 + }, + { + "auxiliary_loss_clip": 0.01805336, + "auxiliary_loss_mlp": 0.01515663, + "balance_loss_clip": 1.45068216, + "balance_loss_mlp": 1.17386627, + "epoch": 0.031022665784885467, + "flos": 55099318785120.0, + "grad_norm": 1.0855008500303795, + "language_loss": 0.53531229, + "learning_rate": 3.999990291258618e-06, + "loss": 0.56852221, + "num_input_tokens_seen": 5392235, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 3.421875, + "step": 258, + "time_per_iteration": 3.4728193283081055 + }, + { + "auxiliary_loss_clip": 0.0167771, + "auxiliary_loss_mlp": 0.01533518, + "balance_loss_clip": 1.32331419, + "balance_loss_mlp": 1.150141, + "epoch": 0.03114290867552456, + "flos": 19319811297120.0, + "grad_norm": 2.5811831388589406, + "language_loss": 0.86784691, + "learning_rate": 3.999987712376829e-06, + "loss": 0.89995921, + "num_input_tokens_seen": 5410555, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 3.828125, + "step": 259, + "time_per_iteration": 2.956465244293213 + }, + { + "auxiliary_loss_clip": 0.01679219, + "auxiliary_loss_mlp": 0.01569405, + "balance_loss_clip": 1.32682943, + "balance_loss_mlp": 1.1972816, + "epoch": 0.031263151566163654, + "flos": 20961548097120.0, + "grad_norm": 3.4457178175369907, + "language_loss": 0.8254509, + "learning_rate": 3.999984830098494e-06, + "loss": 0.85793716, + "num_input_tokens_seen": 5430135, + "router_z_loss_clip": 3.52148438, + "router_z_loss_mlp": 3.72070312, + "step": 260, + "time_per_iteration": 2.945455312728882 + }, + { + "auxiliary_loss_clip": 0.01677326, + "auxiliary_loss_mlp": 0.01540038, + "balance_loss_clip": 1.32471347, + "balance_loss_mlp": 1.1570425, + "epoch": 0.03138339445680274, + "flos": 14794746992640.0, + "grad_norm": 4.726857133891836, + "language_loss": 0.98239529, + "learning_rate": 3.999981644424051e-06, + "loss": 1.01456892, + "num_input_tokens_seen": 5444935, + "router_z_loss_clip": 3.52734375, + "router_z_loss_mlp": 3.828125, + "step": 261, + "time_per_iteration": 3.7717206478118896 + }, + { + "auxiliary_loss_clip": 0.01677619, + "auxiliary_loss_mlp": 0.01566382, + "balance_loss_clip": 1.32524025, + "balance_loss_mlp": 1.18186069, + "epoch": 0.03150363734744183, + "flos": 11657184481440.0, + "grad_norm": 5.025320466794321, + "language_loss": 0.86452949, + "learning_rate": 3.999978155353982e-06, + "loss": 0.8969695, + "num_input_tokens_seen": 5462080, + "router_z_loss_clip": 3.52539062, + "router_z_loss_mlp": 3.84179688, + "step": 262, + "time_per_iteration": 3.872612714767456 + }, + { + "auxiliary_loss_clip": 0.0167431, + "auxiliary_loss_mlp": 0.01516611, + "balance_loss_clip": 1.32142639, + "balance_loss_mlp": 1.13208938, + "epoch": 0.03162388023808092, + "flos": 33730456170240.0, + "grad_norm": 2.775971716663076, + "language_loss": 0.80280197, + "learning_rate": 3.9999743628888186e-06, + "loss": 0.83471119, + "num_input_tokens_seen": 5483870, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 3.84179688, + "step": 263, + "time_per_iteration": 3.8975555896759033 + }, + { + "auxiliary_loss_clip": 0.01672672, + "auxiliary_loss_mlp": 0.01512967, + "balance_loss_clip": 1.32061434, + "balance_loss_mlp": 1.13206995, + "epoch": 0.03174412312872001, + "flos": 20812906180800.0, + "grad_norm": 2.394313937538142, + "language_loss": 0.89582467, + "learning_rate": 3.999970267029133e-06, + "loss": 0.92768109, + "num_input_tokens_seen": 5502830, + "router_z_loss_clip": 3.51757812, + "router_z_loss_mlp": 3.8046875, + "step": 264, + "time_per_iteration": 3.164794921875 + }, + { + "auxiliary_loss_clip": 0.01671522, + "auxiliary_loss_mlp": 0.01527961, + "balance_loss_clip": 1.317312, + "balance_loss_mlp": 1.14763618, + "epoch": 0.0318643660193591, + "flos": 23729952183840.0, + "grad_norm": 4.039878995772948, + "language_loss": 0.80147004, + "learning_rate": 3.999965867775548e-06, + "loss": 0.83346486, + "num_input_tokens_seen": 5523225, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 3.80078125, + "step": 265, + "time_per_iteration": 3.1653246879577637 + }, + { + "auxiliary_loss_clip": 0.01669845, + "auxiliary_loss_mlp": 0.01519947, + "balance_loss_clip": 1.316535, + "balance_loss_mlp": 1.13389969, + "epoch": 0.0319846089099982, + "flos": 13919583885120.0, + "grad_norm": 10.017381710638789, + "language_loss": 0.86758471, + "learning_rate": 3.9999611651287315e-06, + "loss": 0.89948267, + "num_input_tokens_seen": 5541380, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 3.859375, + "step": 266, + "time_per_iteration": 3.0095016956329346 + }, + { + "auxiliary_loss_clip": 0.0167026, + "auxiliary_loss_mlp": 0.01549395, + "balance_loss_clip": 1.31654215, + "balance_loss_mlp": 1.15381122, + "epoch": 0.03210485180063729, + "flos": 14754808275840.0, + "grad_norm": 2.6722248329986975, + "language_loss": 0.78901279, + "learning_rate": 3.999956159089396e-06, + "loss": 0.82120937, + "num_input_tokens_seen": 5558830, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 3.953125, + "step": 267, + "time_per_iteration": 3.0430707931518555 + }, + { + "auxiliary_loss_clip": 0.01675828, + "auxiliary_loss_mlp": 0.01571598, + "balance_loss_clip": 1.32342911, + "balance_loss_mlp": 1.18993747, + "epoch": 0.03222509469127638, + "flos": 28915693665120.0, + "grad_norm": 3.0151261988836344, + "language_loss": 0.79851407, + "learning_rate": 3.999950849658302e-06, + "loss": 0.83098829, + "num_input_tokens_seen": 5577750, + "router_z_loss_clip": 3.52148438, + "router_z_loss_mlp": 3.8125, + "step": 268, + "time_per_iteration": 3.0502371788024902 + }, + { + "auxiliary_loss_clip": 0.01671711, + "auxiliary_loss_mlp": 0.01525462, + "balance_loss_clip": 1.31713176, + "balance_loss_mlp": 1.13502741, + "epoch": 0.03234533758191547, + "flos": 16948291484160.0, + "grad_norm": 2.9461689973922374, + "language_loss": 0.84431684, + "learning_rate": 3.999945236836254e-06, + "loss": 0.87628853, + "num_input_tokens_seen": 5596715, + "router_z_loss_clip": 3.54101562, + "router_z_loss_mlp": 3.90429688, + "step": 269, + "time_per_iteration": 3.0144081115722656 + }, + { + "auxiliary_loss_clip": 0.01676065, + "auxiliary_loss_mlp": 0.01553924, + "balance_loss_clip": 1.32260013, + "balance_loss_mlp": 1.17245448, + "epoch": 0.03246558047255456, + "flos": 18991084655520.0, + "grad_norm": 6.761491613515846, + "language_loss": 0.94960803, + "learning_rate": 3.999939320624103e-06, + "loss": 0.98190796, + "num_input_tokens_seen": 5611865, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 3.81054688, + "step": 270, + "time_per_iteration": 3.0272879600524902 + }, + { + "auxiliary_loss_clip": 0.01672909, + "auxiliary_loss_mlp": 0.01537952, + "balance_loss_clip": 1.3187952, + "balance_loss_mlp": 1.1543839, + "epoch": 0.03258582336319365, + "flos": 23730369393600.0, + "grad_norm": 1.8631341372297676, + "language_loss": 0.89716208, + "learning_rate": 3.999933101022749e-06, + "loss": 0.92927074, + "num_input_tokens_seen": 5632270, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 3.83203125, + "step": 271, + "time_per_iteration": 3.054224967956543 + }, + { + "auxiliary_loss_clip": 0.01672206, + "auxiliary_loss_mlp": 0.01568896, + "balance_loss_clip": 1.31750977, + "balance_loss_mlp": 1.2005868, + "epoch": 0.032706066253832745, + "flos": 27673192684800.0, + "grad_norm": 1.9542701769335924, + "language_loss": 0.87005621, + "learning_rate": 3.999926578033132e-06, + "loss": 0.90246719, + "num_input_tokens_seen": 5652085, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 3.6796875, + "step": 272, + "time_per_iteration": 3.098949432373047 + }, + { + "auxiliary_loss_clip": 0.01674068, + "auxiliary_loss_mlp": 0.01526328, + "balance_loss_clip": 1.31999683, + "balance_loss_mlp": 1.14848232, + "epoch": 0.032826309144471835, + "flos": 45628221520800.0, + "grad_norm": 2.5101047940257897, + "language_loss": 0.63316131, + "learning_rate": 3.999919751656244e-06, + "loss": 0.66516531, + "num_input_tokens_seen": 5678985, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 3.77539062, + "step": 273, + "time_per_iteration": 3.201871156692505 + }, + { + "auxiliary_loss_clip": 0.01675879, + "auxiliary_loss_mlp": 0.01523673, + "balance_loss_clip": 1.32254493, + "balance_loss_mlp": 1.14277554, + "epoch": 0.032946552035110925, + "flos": 25814770120800.0, + "grad_norm": 4.576826587127072, + "language_loss": 0.75741971, + "learning_rate": 3.9999126218931195e-06, + "loss": 0.78941524, + "num_input_tokens_seen": 5697020, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 3.80664062, + "step": 274, + "time_per_iteration": 3.099724531173706 + }, + { + "auxiliary_loss_clip": 0.01688773, + "auxiliary_loss_mlp": 0.01515781, + "balance_loss_clip": 1.33738315, + "balance_loss_mlp": 1.13316691, + "epoch": 0.033066794925750015, + "flos": 15123777059520.0, + "grad_norm": 2.926923397145629, + "language_loss": 0.89759004, + "learning_rate": 3.99990518874484e-06, + "loss": 0.92963558, + "num_input_tokens_seen": 5713460, + "router_z_loss_clip": 3.51367188, + "router_z_loss_mlp": 3.82421875, + "step": 275, + "time_per_iteration": 3.1120822429656982 + }, + { + "auxiliary_loss_clip": 0.01682516, + "auxiliary_loss_mlp": 0.01551408, + "balance_loss_clip": 1.32824731, + "balance_loss_mlp": 1.17585135, + "epoch": 0.033187037816389105, + "flos": 22778401033440.0, + "grad_norm": 2.4663492477055406, + "language_loss": 0.9259401, + "learning_rate": 3.999897452212534e-06, + "loss": 0.95827931, + "num_input_tokens_seen": 5730790, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 3.75195312, + "step": 276, + "time_per_iteration": 3.123070001602173 + }, + { + "auxiliary_loss_clip": 0.0167571, + "auxiliary_loss_mlp": 0.01546589, + "balance_loss_clip": 1.32157052, + "balance_loss_mlp": 1.16817153, + "epoch": 0.033307280707028195, + "flos": 23333674760640.0, + "grad_norm": 6.112040838932997, + "language_loss": 1.00223255, + "learning_rate": 3.999889412297374e-06, + "loss": 1.03445554, + "num_input_tokens_seen": 5750215, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 3.78320312, + "step": 277, + "time_per_iteration": 3.060206651687622 + }, + { + "auxiliary_loss_clip": 0.01668267, + "auxiliary_loss_mlp": 0.01501116, + "balance_loss_clip": 1.31455255, + "balance_loss_mlp": 1.12765694, + "epoch": 0.03342752359766729, + "flos": 28842832941120.0, + "grad_norm": 1.9817059533958366, + "language_loss": 0.78995466, + "learning_rate": 3.999881069000581e-06, + "loss": 0.82164848, + "num_input_tokens_seen": 5769945, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 3.73046875, + "step": 278, + "time_per_iteration": 2.994198799133301 + }, + { + "auxiliary_loss_clip": 0.0168062, + "auxiliary_loss_mlp": 0.01528275, + "balance_loss_clip": 1.32556045, + "balance_loss_mlp": 1.16378069, + "epoch": 0.03354776648830638, + "flos": 19386224233920.0, + "grad_norm": 4.090901086314728, + "language_loss": 0.86968756, + "learning_rate": 3.99987242232342e-06, + "loss": 0.90177655, + "num_input_tokens_seen": 5784950, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 3.64257812, + "step": 279, + "time_per_iteration": 2.932492971420288 + }, + { + "auxiliary_loss_clip": 0.01688832, + "auxiliary_loss_mlp": 0.01542715, + "balance_loss_clip": 1.33631659, + "balance_loss_mlp": 1.16658616, + "epoch": 0.03366800937894547, + "flos": 17860586624640.0, + "grad_norm": 2.07953368787367, + "language_loss": 0.80249488, + "learning_rate": 3.9998634722672026e-06, + "loss": 0.83481038, + "num_input_tokens_seen": 5805005, + "router_z_loss_clip": 3.52539062, + "router_z_loss_mlp": 3.75976562, + "step": 280, + "time_per_iteration": 3.0213704109191895 + }, + { + "auxiliary_loss_clip": 0.01686512, + "auxiliary_loss_mlp": 0.01541312, + "balance_loss_clip": 1.33348513, + "balance_loss_mlp": 1.16861653, + "epoch": 0.03378825226958456, + "flos": 35953751276640.0, + "grad_norm": 2.4048325457959896, + "language_loss": 0.7884022, + "learning_rate": 3.999854218833286e-06, + "loss": 0.82068044, + "num_input_tokens_seen": 5825825, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 3.72265625, + "step": 281, + "time_per_iteration": 3.190051555633545 + }, + { + "auxiliary_loss_clip": 0.01676609, + "auxiliary_loss_mlp": 0.01520238, + "balance_loss_clip": 1.32312822, + "balance_loss_mlp": 1.15326428, + "epoch": 0.03390849516022365, + "flos": 25704853220160.0, + "grad_norm": 3.084721899137967, + "language_loss": 0.81930995, + "learning_rate": 3.999844662023075e-06, + "loss": 0.85127842, + "num_input_tokens_seen": 5845700, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 3.66601562, + "step": 282, + "time_per_iteration": 2.9960856437683105 + }, + { + "auxiliary_loss_clip": 0.01672697, + "auxiliary_loss_mlp": 0.0152315, + "balance_loss_clip": 1.31874204, + "balance_loss_mlp": 1.1542691, + "epoch": 0.03402873805086274, + "flos": 21286443994560.0, + "grad_norm": 3.200202955836336, + "language_loss": 0.92383969, + "learning_rate": 3.999834801838018e-06, + "loss": 0.95579815, + "num_input_tokens_seen": 5864680, + "router_z_loss_clip": 3.54101562, + "router_z_loss_mlp": 3.6875, + "step": 283, + "time_per_iteration": 3.018956422805786 + }, + { + "auxiliary_loss_clip": 0.01683565, + "auxiliary_loss_mlp": 0.0153467, + "balance_loss_clip": 1.33069992, + "balance_loss_mlp": 1.16941309, + "epoch": 0.03414898094150183, + "flos": 22713125941440.0, + "grad_norm": 2.066813892152535, + "language_loss": 0.74153429, + "learning_rate": 3.9998246382796115e-06, + "loss": 0.77371669, + "num_input_tokens_seen": 5884260, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 3.65429688, + "step": 284, + "time_per_iteration": 3.0255720615386963 + }, + { + "auxiliary_loss_clip": 0.01675361, + "auxiliary_loss_mlp": 0.01508105, + "balance_loss_clip": 1.32166421, + "balance_loss_mlp": 1.13865137, + "epoch": 0.03426922383214093, + "flos": 18881926318080.0, + "grad_norm": 2.54837483432196, + "language_loss": 0.91270626, + "learning_rate": 3.999814171349399e-06, + "loss": 0.94454086, + "num_input_tokens_seen": 5902120, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 3.69335938, + "step": 285, + "time_per_iteration": 3.0659115314483643 + }, + { + "auxiliary_loss_clip": 0.01673294, + "auxiliary_loss_mlp": 0.01510566, + "balance_loss_clip": 1.32018375, + "balance_loss_mlp": 1.14435506, + "epoch": 0.03438946672278002, + "flos": 34754943900960.0, + "grad_norm": 2.242642737294601, + "language_loss": 0.73880255, + "learning_rate": 3.9998034010489655e-06, + "loss": 0.77064115, + "num_input_tokens_seen": 5925810, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 3.65820312, + "step": 286, + "time_per_iteration": 3.111095905303955 + }, + { + "auxiliary_loss_clip": 0.01684802, + "auxiliary_loss_mlp": 0.015124, + "balance_loss_clip": 1.33171296, + "balance_loss_mlp": 1.14428163, + "epoch": 0.03450970961341911, + "flos": 22166310193920.0, + "grad_norm": 2.9819932334456904, + "language_loss": 0.75761902, + "learning_rate": 3.999792327379946e-06, + "loss": 0.78959101, + "num_input_tokens_seen": 5945185, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 3.6796875, + "step": 287, + "time_per_iteration": 3.150343418121338 + }, + { + "auxiliary_loss_clip": 0.01682412, + "auxiliary_loss_mlp": 0.01545718, + "balance_loss_clip": 1.32877922, + "balance_loss_mlp": 1.17988837, + "epoch": 0.034629952504058197, + "flos": 21727894220640.0, + "grad_norm": 2.1009183240400957, + "language_loss": 0.96407026, + "learning_rate": 3.999780950344021e-06, + "loss": 0.99635154, + "num_input_tokens_seen": 5963375, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 3.65820312, + "step": 288, + "time_per_iteration": 3.8873403072357178 + }, + { + "auxiliary_loss_clip": 0.01672495, + "auxiliary_loss_mlp": 0.01553763, + "balance_loss_clip": 1.31780219, + "balance_loss_mlp": 1.18678916, + "epoch": 0.034750195394697286, + "flos": 20050087376160.0, + "grad_norm": 7.3341656314457015, + "language_loss": 0.82748944, + "learning_rate": 3.999769269942916e-06, + "loss": 0.85975206, + "num_input_tokens_seen": 5983415, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 3.66796875, + "step": 289, + "time_per_iteration": 3.003103017807007 + }, + { + "auxiliary_loss_clip": 0.01674151, + "auxiliary_loss_mlp": 0.01555834, + "balance_loss_clip": 1.31946588, + "balance_loss_mlp": 1.18142188, + "epoch": 0.034870438285336376, + "flos": 27968428396800.0, + "grad_norm": 3.0948211947454443, + "language_loss": 0.81126869, + "learning_rate": 3.999757286178402e-06, + "loss": 0.8435685, + "num_input_tokens_seen": 6005850, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 3.74023438, + "step": 290, + "time_per_iteration": 4.6985321044921875 + }, + { + "auxiliary_loss_clip": 0.01683278, + "auxiliary_loss_mlp": 0.01532788, + "balance_loss_clip": 1.33044338, + "balance_loss_mlp": 1.1703918, + "epoch": 0.03499068117597547, + "flos": 22019678470080.0, + "grad_norm": 2.1672068641778477, + "language_loss": 0.90695953, + "learning_rate": 3.999744999052299e-06, + "loss": 0.93912023, + "num_input_tokens_seen": 6027240, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 3.62304688, + "step": 291, + "time_per_iteration": 3.8878014087677 + }, + { + "auxiliary_loss_clip": 0.0182301, + "auxiliary_loss_mlp": 0.01502861, + "balance_loss_clip": 1.47123885, + "balance_loss_mlp": 1.1465683, + "epoch": 0.03511092406661456, + "flos": 57247174052640.0, + "grad_norm": 0.9920252822106833, + "language_loss": 0.61137271, + "learning_rate": 3.9997324085664675e-06, + "loss": 0.64463139, + "num_input_tokens_seen": 6087470, + "router_z_loss_clip": 3.515625, + "router_z_loss_mlp": 3.5703125, + "step": 292, + "time_per_iteration": 3.428621768951416 + }, + { + "auxiliary_loss_clip": 0.01667035, + "auxiliary_loss_mlp": 0.01539377, + "balance_loss_clip": 1.31288671, + "balance_loss_mlp": 1.17698085, + "epoch": 0.03523116695725365, + "flos": 22930190987040.0, + "grad_norm": 34.64075395574347, + "language_loss": 0.92062068, + "learning_rate": 3.999719514722821e-06, + "loss": 0.95268476, + "num_input_tokens_seen": 6107600, + "router_z_loss_clip": 3.54101562, + "router_z_loss_mlp": 3.62109375, + "step": 293, + "time_per_iteration": 3.1110682487487793 + }, + { + "auxiliary_loss_clip": 0.01676186, + "auxiliary_loss_mlp": 0.01521518, + "balance_loss_clip": 1.32210422, + "balance_loss_mlp": 1.14977527, + "epoch": 0.03535140984789274, + "flos": 36906212702880.0, + "grad_norm": 2.596366267256512, + "language_loss": 0.74810672, + "learning_rate": 3.999706317523314e-06, + "loss": 0.78008366, + "num_input_tokens_seen": 6126160, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 3.71289062, + "step": 294, + "time_per_iteration": 3.1157429218292236 + }, + { + "auxiliary_loss_clip": 0.01674457, + "auxiliary_loss_mlp": 0.01538331, + "balance_loss_clip": 1.31982493, + "balance_loss_mlp": 1.16906857, + "epoch": 0.03547165273853183, + "flos": 20451257532000.0, + "grad_norm": 2.4630853735453524, + "language_loss": 0.8611663, + "learning_rate": 3.999692816969948e-06, + "loss": 0.89329416, + "num_input_tokens_seen": 6145695, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 3.68945312, + "step": 295, + "time_per_iteration": 3.0925540924072266 + }, + { + "auxiliary_loss_clip": 0.01815453, + "auxiliary_loss_mlp": 0.0148732, + "balance_loss_clip": 1.46356881, + "balance_loss_mlp": 1.15544128, + "epoch": 0.03559189562917092, + "flos": 69857464739040.0, + "grad_norm": 1.0199163378532976, + "language_loss": 0.6945504, + "learning_rate": 3.999679013064772e-06, + "loss": 0.72757816, + "num_input_tokens_seen": 6212440, + "router_z_loss_clip": 3.515625, + "router_z_loss_mlp": 3.3203125, + "step": 296, + "time_per_iteration": 3.439716339111328 + }, + { + "auxiliary_loss_clip": 0.01673817, + "auxiliary_loss_mlp": 0.01543741, + "balance_loss_clip": 1.32177663, + "balance_loss_mlp": 1.18001008, + "epoch": 0.03571213851981002, + "flos": 21654047364480.0, + "grad_norm": 4.498594553453977, + "language_loss": 0.85963511, + "learning_rate": 3.99966490580988e-06, + "loss": 0.89181066, + "num_input_tokens_seen": 6229800, + "router_z_loss_clip": 3.52148438, + "router_z_loss_mlp": 3.63867188, + "step": 297, + "time_per_iteration": 2.970714569091797 + }, + { + "auxiliary_loss_clip": 0.01682736, + "auxiliary_loss_mlp": 0.0153746, + "balance_loss_clip": 1.33046269, + "balance_loss_mlp": 1.16991448, + "epoch": 0.03583238141044911, + "flos": 43949011334400.0, + "grad_norm": 7.608154100049932, + "language_loss": 0.66095114, + "learning_rate": 3.999650495207411e-06, + "loss": 0.69315308, + "num_input_tokens_seen": 6255825, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 3.671875, + "step": 298, + "time_per_iteration": 3.227905750274658 + }, + { + "auxiliary_loss_clip": 0.01681252, + "auxiliary_loss_mlp": 0.01547057, + "balance_loss_clip": 1.32813168, + "balance_loss_mlp": 1.18446994, + "epoch": 0.0359526243010882, + "flos": 18912496779360.0, + "grad_norm": 3.182629027215172, + "language_loss": 0.90632939, + "learning_rate": 3.999635781259553e-06, + "loss": 0.93861252, + "num_input_tokens_seen": 6271090, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 3.62695312, + "step": 299, + "time_per_iteration": 2.944122314453125 + }, + { + "auxiliary_loss_clip": 0.01788048, + "auxiliary_loss_mlp": 0.01493538, + "balance_loss_clip": 1.43684828, + "balance_loss_mlp": 1.16165924, + "epoch": 0.03607286719172729, + "flos": 61674837749280.0, + "grad_norm": 0.9425465163071134, + "language_loss": 0.52264297, + "learning_rate": 3.999620763968535e-06, + "loss": 0.55545878, + "num_input_tokens_seen": 6329965, + "router_z_loss_clip": 3.5078125, + "router_z_loss_mlp": 3.3203125, + "step": 300, + "time_per_iteration": 3.2692341804504395 + }, + { + "auxiliary_loss_clip": 0.01677163, + "auxiliary_loss_mlp": 0.01536041, + "balance_loss_clip": 1.32592845, + "balance_loss_mlp": 1.16811395, + "epoch": 0.03619311008236638, + "flos": 27821569104000.0, + "grad_norm": 1.9488959637014054, + "language_loss": 0.86589706, + "learning_rate": 3.999605443336638e-06, + "loss": 0.89802909, + "num_input_tokens_seen": 6352095, + "router_z_loss_clip": 3.50585938, + "router_z_loss_mlp": 3.67578125, + "step": 301, + "time_per_iteration": 3.042797088623047 + }, + { + "auxiliary_loss_clip": 0.01679061, + "auxiliary_loss_mlp": 0.01531644, + "balance_loss_clip": 1.32582664, + "balance_loss_mlp": 1.15971065, + "epoch": 0.03631335297300547, + "flos": 13622565549600.0, + "grad_norm": 2.4824883062275234, + "language_loss": 0.89618772, + "learning_rate": 3.999589819366185e-06, + "loss": 0.92829478, + "num_input_tokens_seen": 6365885, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 3.71679688, + "step": 302, + "time_per_iteration": 2.9494094848632812 + }, + { + "auxiliary_loss_clip": 0.0167914, + "auxiliary_loss_mlp": 0.0154019, + "balance_loss_clip": 1.32672203, + "balance_loss_mlp": 1.17188048, + "epoch": 0.036433595863644565, + "flos": 27634012531200.0, + "grad_norm": 2.298742555184798, + "language_loss": 0.84890461, + "learning_rate": 3.999573892059547e-06, + "loss": 0.88109791, + "num_input_tokens_seen": 6385015, + "router_z_loss_clip": 3.5234375, + "router_z_loss_mlp": 3.68164062, + "step": 303, + "time_per_iteration": 3.0691144466400146 + }, + { + "auxiliary_loss_clip": 0.01668534, + "auxiliary_loss_mlp": 0.01533802, + "balance_loss_clip": 1.31547832, + "balance_loss_mlp": 1.16244102, + "epoch": 0.036553838754283655, + "flos": 24574355189280.0, + "grad_norm": 2.983437516846064, + "language_loss": 0.8091808, + "learning_rate": 3.999557661419138e-06, + "loss": 0.84120417, + "num_input_tokens_seen": 6405165, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 3.71289062, + "step": 304, + "time_per_iteration": 2.970085620880127 + }, + { + "auxiliary_loss_clip": 0.01681898, + "auxiliary_loss_mlp": 0.01517359, + "balance_loss_clip": 1.32993889, + "balance_loss_mlp": 1.13817811, + "epoch": 0.036674081644922744, + "flos": 23406573412800.0, + "grad_norm": 2.1598300318370005, + "language_loss": 0.81603134, + "learning_rate": 3.9995411274474225e-06, + "loss": 0.84802389, + "num_input_tokens_seen": 6424445, + "router_z_loss_clip": 3.51757812, + "router_z_loss_mlp": 3.79101562, + "step": 305, + "time_per_iteration": 3.02413272857666 + }, + { + "auxiliary_loss_clip": 0.01671753, + "auxiliary_loss_mlp": 0.01514066, + "balance_loss_clip": 1.31793249, + "balance_loss_mlp": 1.12954473, + "epoch": 0.036794324535561834, + "flos": 27492121827360.0, + "grad_norm": 2.787678394158433, + "language_loss": 0.81486571, + "learning_rate": 3.999524290146908e-06, + "loss": 0.84672385, + "num_input_tokens_seen": 6444650, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 3.84375, + "step": 306, + "time_per_iteration": 3.162830352783203 + }, + { + "auxiliary_loss_clip": 0.01680571, + "auxiliary_loss_mlp": 0.01525568, + "balance_loss_clip": 1.32735825, + "balance_loss_mlp": 1.14657831, + "epoch": 0.036914567426200924, + "flos": 19465494816960.0, + "grad_norm": 4.995657618688578, + "language_loss": 0.9266901, + "learning_rate": 3.9995071495201485e-06, + "loss": 0.95875144, + "num_input_tokens_seen": 6461755, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 3.78515625, + "step": 307, + "time_per_iteration": 3.157505989074707 + }, + { + "auxiliary_loss_clip": 0.01676044, + "auxiliary_loss_mlp": 0.01523965, + "balance_loss_clip": 1.32361674, + "balance_loss_mlp": 1.14783549, + "epoch": 0.037034810316840014, + "flos": 22311500647680.0, + "grad_norm": 3.20578153401246, + "language_loss": 0.98027229, + "learning_rate": 3.999489705569744e-06, + "loss": 1.01227236, + "num_input_tokens_seen": 6479455, + "router_z_loss_clip": 3.5234375, + "router_z_loss_mlp": 3.7578125, + "step": 308, + "time_per_iteration": 3.0691823959350586 + }, + { + "auxiliary_loss_clip": 0.01672895, + "auxiliary_loss_mlp": 0.01501326, + "balance_loss_clip": 1.32029653, + "balance_loss_mlp": 1.11756802, + "epoch": 0.03715505320747911, + "flos": 18590331709440.0, + "grad_norm": 2.4505384765669898, + "language_loss": 0.86434817, + "learning_rate": 3.999471958298341e-06, + "loss": 0.89609045, + "num_input_tokens_seen": 6498365, + "router_z_loss_clip": 3.52539062, + "router_z_loss_mlp": 3.83398438, + "step": 309, + "time_per_iteration": 3.0549824237823486 + }, + { + "auxiliary_loss_clip": 0.01674381, + "auxiliary_loss_mlp": 0.01526451, + "balance_loss_clip": 1.32250738, + "balance_loss_mlp": 1.13201189, + "epoch": 0.0372752960981182, + "flos": 35958302655840.0, + "grad_norm": 2.1892926473651992, + "language_loss": 0.76029366, + "learning_rate": 3.999453907708631e-06, + "loss": 0.79230201, + "num_input_tokens_seen": 6520770, + "router_z_loss_clip": 3.51757812, + "router_z_loss_mlp": 3.94335938, + "step": 310, + "time_per_iteration": 3.1294615268707275 + }, + { + "auxiliary_loss_clip": 0.0167465, + "auxiliary_loss_mlp": 0.01520418, + "balance_loss_clip": 1.32167387, + "balance_loss_mlp": 1.13913918, + "epoch": 0.03739553898875729, + "flos": 20816168002560.0, + "grad_norm": 2.1334715811373983, + "language_loss": 0.81401926, + "learning_rate": 3.999435553803353e-06, + "loss": 0.84596997, + "num_input_tokens_seen": 6540170, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 3.80859375, + "step": 311, + "time_per_iteration": 3.0852150917053223 + }, + { + "auxiliary_loss_clip": 0.01674279, + "auxiliary_loss_mlp": 0.01520144, + "balance_loss_clip": 1.32304454, + "balance_loss_mlp": 1.1407721, + "epoch": 0.03751578187939638, + "flos": 20266090433280.0, + "grad_norm": 4.401781449944701, + "language_loss": 0.83525968, + "learning_rate": 3.999416896585292e-06, + "loss": 0.86720383, + "num_input_tokens_seen": 6557200, + "router_z_loss_clip": 3.5078125, + "router_z_loss_mlp": 3.79296875, + "step": 312, + "time_per_iteration": 2.9795310497283936 + }, + { + "auxiliary_loss_clip": 0.01676333, + "auxiliary_loss_mlp": 0.01500296, + "balance_loss_clip": 1.32386923, + "balance_loss_mlp": 1.10528374, + "epoch": 0.03763602477003547, + "flos": 20670181057440.0, + "grad_norm": 3.5294456274283905, + "language_loss": 0.85885978, + "learning_rate": 3.9993979360572775e-06, + "loss": 0.89062607, + "num_input_tokens_seen": 6577340, + "router_z_loss_clip": 3.5234375, + "router_z_loss_mlp": 3.94726562, + "step": 313, + "time_per_iteration": 3.1386096477508545 + }, + { + "auxiliary_loss_clip": 0.01686352, + "auxiliary_loss_mlp": 0.01525135, + "balance_loss_clip": 1.33467269, + "balance_loss_mlp": 1.14366555, + "epoch": 0.03775626766067456, + "flos": 16693259986080.0, + "grad_norm": 12.600127879932659, + "language_loss": 0.82925361, + "learning_rate": 3.999378672222185e-06, + "loss": 0.86136848, + "num_input_tokens_seen": 6595125, + "router_z_loss_clip": 3.52148438, + "router_z_loss_mlp": 3.81445312, + "step": 314, + "time_per_iteration": 2.989097833633423 + }, + { + "auxiliary_loss_clip": 0.0168048, + "auxiliary_loss_mlp": 0.01526755, + "balance_loss_clip": 1.32807255, + "balance_loss_mlp": 1.1433785, + "epoch": 0.03787651055131366, + "flos": 21143908512000.0, + "grad_norm": 2.1127330295230538, + "language_loss": 0.83215082, + "learning_rate": 3.9993591050829385e-06, + "loss": 0.86422318, + "num_input_tokens_seen": 6612990, + "router_z_loss_clip": 3.5234375, + "router_z_loss_mlp": 3.83203125, + "step": 315, + "time_per_iteration": 2.9277162551879883 + }, + { + "auxiliary_loss_clip": 0.01674327, + "auxiliary_loss_mlp": 0.0151304, + "balance_loss_clip": 1.32025576, + "balance_loss_mlp": 1.12699282, + "epoch": 0.037996753441952746, + "flos": 22020285320640.0, + "grad_norm": 1.9697887935179366, + "language_loss": 0.7909351, + "learning_rate": 3.999339234642506e-06, + "loss": 0.82280874, + "num_input_tokens_seen": 6632740, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 3.859375, + "step": 316, + "time_per_iteration": 3.7662415504455566 + }, + { + "auxiliary_loss_clip": 0.0167895, + "auxiliary_loss_mlp": 0.01540339, + "balance_loss_clip": 1.32638359, + "balance_loss_mlp": 1.15543604, + "epoch": 0.038116996332591836, + "flos": 27711879772320.0, + "grad_norm": 2.1121126467894067, + "language_loss": 0.83837485, + "learning_rate": 3.9993190609038994e-06, + "loss": 0.87056774, + "num_input_tokens_seen": 6651505, + "router_z_loss_clip": 3.52148438, + "router_z_loss_mlp": 3.84765625, + "step": 317, + "time_per_iteration": 3.797349452972412 + }, + { + "auxiliary_loss_clip": 0.01673034, + "auxiliary_loss_mlp": 0.01529288, + "balance_loss_clip": 1.31944656, + "balance_loss_mlp": 1.14743662, + "epoch": 0.038237239223230926, + "flos": 21180168197280.0, + "grad_norm": 2.5066559438613942, + "language_loss": 0.8328594, + "learning_rate": 3.999298583870182e-06, + "loss": 0.86488253, + "num_input_tokens_seen": 6671090, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 3.81640625, + "step": 318, + "time_per_iteration": 4.7182440757751465 + }, + { + "auxiliary_loss_clip": 0.01673438, + "auxiliary_loss_mlp": 0.01528133, + "balance_loss_clip": 1.31955433, + "balance_loss_mlp": 1.14609098, + "epoch": 0.038357482113870016, + "flos": 25558980059520.0, + "grad_norm": 2.6144795394480482, + "language_loss": 0.77687979, + "learning_rate": 3.999277803544458e-06, + "loss": 0.80889547, + "num_input_tokens_seen": 6691245, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 3.8203125, + "step": 319, + "time_per_iteration": 2.9411516189575195 + }, + { + "auxiliary_loss_clip": 0.01708131, + "auxiliary_loss_mlp": 0.01485512, + "balance_loss_clip": 1.36101425, + "balance_loss_mlp": 1.1009903, + "epoch": 0.038477725004509106, + "flos": 59233567321440.0, + "grad_norm": 0.9692866099688519, + "language_loss": 0.62347168, + "learning_rate": 3.999256719929882e-06, + "loss": 0.65540814, + "num_input_tokens_seen": 6752520, + "router_z_loss_clip": 3.46875, + "router_z_loss_mlp": 3.84375, + "step": 320, + "time_per_iteration": 3.4783730506896973 + }, + { + "auxiliary_loss_clip": 0.0170457, + "auxiliary_loss_mlp": 0.01461899, + "balance_loss_clip": 1.35785663, + "balance_loss_mlp": 1.08729553, + "epoch": 0.0385979678951482, + "flos": 67323573015840.0, + "grad_norm": 1.2416004995624734, + "language_loss": 0.67121041, + "learning_rate": 3.999235333029651e-06, + "loss": 0.70287502, + "num_input_tokens_seen": 6806460, + "router_z_loss_clip": 3.46875, + "router_z_loss_mlp": 3.7421875, + "step": 321, + "time_per_iteration": 3.3070850372314453 + }, + { + "auxiliary_loss_clip": 0.01683906, + "auxiliary_loss_mlp": 0.0153694, + "balance_loss_clip": 1.33160782, + "balance_loss_mlp": 1.16596055, + "epoch": 0.03871821078578729, + "flos": 22748930488800.0, + "grad_norm": 2.041913365576678, + "language_loss": 0.81986475, + "learning_rate": 3.999213642847009e-06, + "loss": 0.85207319, + "num_input_tokens_seen": 6827045, + "router_z_loss_clip": 3.5234375, + "router_z_loss_mlp": 3.7109375, + "step": 322, + "time_per_iteration": 3.0536766052246094 + }, + { + "auxiliary_loss_clip": 0.01667587, + "auxiliary_loss_mlp": 0.01536874, + "balance_loss_clip": 1.31408858, + "balance_loss_mlp": 1.16894662, + "epoch": 0.03883845367642638, + "flos": 26282694566880.0, + "grad_norm": 1.8324731212778727, + "language_loss": 0.9120878, + "learning_rate": 3.999191649385247e-06, + "loss": 0.94413233, + "num_input_tokens_seen": 6848220, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 3.6796875, + "step": 323, + "time_per_iteration": 3.105621337890625 + }, + { + "auxiliary_loss_clip": 0.01687231, + "auxiliary_loss_mlp": 0.01447914, + "balance_loss_clip": 1.3406378, + "balance_loss_mlp": 1.10382843, + "epoch": 0.03895869656706547, + "flos": 56968550874720.0, + "grad_norm": 0.9205886178744158, + "language_loss": 0.59731185, + "learning_rate": 3.999169352647702e-06, + "loss": 0.6286633, + "num_input_tokens_seen": 6909400, + "router_z_loss_clip": 3.46875, + "router_z_loss_mlp": 3.4453125, + "step": 324, + "time_per_iteration": 3.345431327819824 + }, + { + "auxiliary_loss_clip": 0.01669839, + "auxiliary_loss_mlp": 0.01551816, + "balance_loss_clip": 1.31601214, + "balance_loss_mlp": 1.1991477, + "epoch": 0.03907893945770456, + "flos": 24865684300800.0, + "grad_norm": 2.448724611287293, + "language_loss": 0.83109206, + "learning_rate": 3.999146752637755e-06, + "loss": 0.86330861, + "num_input_tokens_seen": 6930445, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 3.52734375, + "step": 325, + "time_per_iteration": 3.016319990158081 + }, + { + "auxiliary_loss_clip": 0.01665633, + "auxiliary_loss_mlp": 0.01581309, + "balance_loss_clip": 1.31110287, + "balance_loss_mlp": 1.22921252, + "epoch": 0.03919918234834365, + "flos": 18370535836320.0, + "grad_norm": 3.57551026074864, + "language_loss": 0.89941978, + "learning_rate": 3.999123849358836e-06, + "loss": 0.93188918, + "num_input_tokens_seen": 6948110, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 3.5234375, + "step": 326, + "time_per_iteration": 2.9970171451568604 + }, + { + "auxiliary_loss_clip": 0.01668574, + "auxiliary_loss_mlp": 0.01533432, + "balance_loss_clip": 1.31634378, + "balance_loss_mlp": 1.17733037, + "epoch": 0.03931942523898275, + "flos": 25227598446720.0, + "grad_norm": 2.130230580570211, + "language_loss": 0.74925464, + "learning_rate": 3.999100642814418e-06, + "loss": 0.78127468, + "num_input_tokens_seen": 6968550, + "router_z_loss_clip": 3.52148438, + "router_z_loss_mlp": 3.5625, + "step": 327, + "time_per_iteration": 2.9963507652282715 + }, + { + "auxiliary_loss_clip": 0.01669007, + "auxiliary_loss_mlp": 0.01563318, + "balance_loss_clip": 1.31479812, + "balance_loss_mlp": 1.19996798, + "epoch": 0.03943966812962184, + "flos": 23260131329760.0, + "grad_norm": 3.011057645819103, + "language_loss": 0.88603389, + "learning_rate": 3.999077133008022e-06, + "loss": 0.91835713, + "num_input_tokens_seen": 6987135, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 3.6328125, + "step": 328, + "time_per_iteration": 3.045950174331665 + }, + { + "auxiliary_loss_clip": 0.01669917, + "auxiliary_loss_mlp": 0.01552648, + "balance_loss_clip": 1.31629062, + "balance_loss_mlp": 1.20207715, + "epoch": 0.03955991102026093, + "flos": 29171407870080.0, + "grad_norm": 2.9744869255590634, + "language_loss": 0.90774953, + "learning_rate": 3.9990533199432145e-06, + "loss": 0.93997514, + "num_input_tokens_seen": 7008630, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 3.5078125, + "step": 329, + "time_per_iteration": 3.04827880859375 + }, + { + "auxiliary_loss_clip": 0.01672796, + "auxiliary_loss_mlp": 0.01582956, + "balance_loss_clip": 1.31921959, + "balance_loss_mlp": 1.23658156, + "epoch": 0.03968015391090002, + "flos": 17604341425440.0, + "grad_norm": 6.684119297722163, + "language_loss": 0.75868386, + "learning_rate": 3.999029203623608e-06, + "loss": 0.79124141, + "num_input_tokens_seen": 7026350, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 3.46484375, + "step": 330, + "time_per_iteration": 2.966949462890625 + }, + { + "auxiliary_loss_clip": 0.01672175, + "auxiliary_loss_mlp": 0.01536658, + "balance_loss_clip": 1.31879258, + "balance_loss_mlp": 1.18265438, + "epoch": 0.03980039680153911, + "flos": 21801475579680.0, + "grad_norm": 2.5810977128182713, + "language_loss": 0.8717401, + "learning_rate": 3.99900478405286e-06, + "loss": 0.90382838, + "num_input_tokens_seen": 7045660, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 3.54296875, + "step": 331, + "time_per_iteration": 3.1128251552581787 + }, + { + "auxiliary_loss_clip": 0.0167409, + "auxiliary_loss_mlp": 0.01555179, + "balance_loss_clip": 1.32187366, + "balance_loss_mlp": 1.19678855, + "epoch": 0.0399206396921782, + "flos": 15196941208800.0, + "grad_norm": 2.850473754160327, + "language_loss": 0.82816505, + "learning_rate": 3.998980061234676e-06, + "loss": 0.86045778, + "num_input_tokens_seen": 7063575, + "router_z_loss_clip": 3.52148438, + "router_z_loss_mlp": 3.58398438, + "step": 332, + "time_per_iteration": 2.9819183349609375 + }, + { + "auxiliary_loss_clip": 0.01671546, + "auxiliary_loss_mlp": 0.01543055, + "balance_loss_clip": 1.31799471, + "balance_loss_mlp": 1.18313861, + "epoch": 0.040040882582817294, + "flos": 14424336938880.0, + "grad_norm": 2.847657549672328, + "language_loss": 0.75954485, + "learning_rate": 3.9989550351728055e-06, + "loss": 0.79169095, + "num_input_tokens_seen": 7080505, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 3.59765625, + "step": 333, + "time_per_iteration": 2.9730117321014404 + }, + { + "auxiliary_loss_clip": 0.01676637, + "auxiliary_loss_mlp": 0.01525754, + "balance_loss_clip": 1.32550454, + "balance_loss_mlp": 1.16850758, + "epoch": 0.040161125473456384, + "flos": 19282906833120.0, + "grad_norm": 3.2300190646502167, + "language_loss": 0.84495896, + "learning_rate": 3.998929705871046e-06, + "loss": 0.87698287, + "num_input_tokens_seen": 7097860, + "router_z_loss_clip": 3.51171875, + "router_z_loss_mlp": 3.57226562, + "step": 334, + "time_per_iteration": 2.9819846153259277 + }, + { + "auxiliary_loss_clip": 0.01674013, + "auxiliary_loss_mlp": 0.01541443, + "balance_loss_clip": 1.32199442, + "balance_loss_mlp": 1.17923713, + "epoch": 0.040281368364095474, + "flos": 17822999453760.0, + "grad_norm": 2.908998448202934, + "language_loss": 0.89754277, + "learning_rate": 3.99890407333324e-06, + "loss": 0.92969733, + "num_input_tokens_seen": 7116390, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 3.62109375, + "step": 335, + "time_per_iteration": 2.9227445125579834 + }, + { + "auxiliary_loss_clip": 0.01662779, + "auxiliary_loss_mlp": 0.01530173, + "balance_loss_clip": 1.30931902, + "balance_loss_mlp": 1.16701412, + "epoch": 0.040401611254734564, + "flos": 19575828927360.0, + "grad_norm": 1.8043640627806259, + "language_loss": 0.87168396, + "learning_rate": 3.998878137563275e-06, + "loss": 0.90361345, + "num_input_tokens_seen": 7135940, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 3.62890625, + "step": 336, + "time_per_iteration": 3.0440170764923096 + }, + { + "auxiliary_loss_clip": 0.01675605, + "auxiliary_loss_mlp": 0.01550721, + "balance_loss_clip": 1.32481861, + "balance_loss_mlp": 1.18546331, + "epoch": 0.040521854145373654, + "flos": 22056658790400.0, + "grad_norm": 2.816455970819335, + "language_loss": 0.85295618, + "learning_rate": 3.998851898565085e-06, + "loss": 0.8852194, + "num_input_tokens_seen": 7155745, + "router_z_loss_clip": 3.51171875, + "router_z_loss_mlp": 3.6484375, + "step": 337, + "time_per_iteration": 2.9438648223876953 + }, + { + "auxiliary_loss_clip": 0.01664118, + "auxiliary_loss_mlp": 0.01519288, + "balance_loss_clip": 1.31179237, + "balance_loss_mlp": 1.14697409, + "epoch": 0.04064209703601274, + "flos": 22676866256160.0, + "grad_norm": 3.9366291193783067, + "language_loss": 0.8336094, + "learning_rate": 3.998825356342653e-06, + "loss": 0.86544347, + "num_input_tokens_seen": 7175920, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 3.71875, + "step": 338, + "time_per_iteration": 3.0064971446990967 + }, + { + "auxiliary_loss_clip": 0.01665532, + "auxiliary_loss_mlp": 0.01538109, + "balance_loss_clip": 1.31221342, + "balance_loss_mlp": 1.16732073, + "epoch": 0.04076233992665183, + "flos": 38585271176640.0, + "grad_norm": 3.192116301698439, + "language_loss": 0.73033488, + "learning_rate": 3.998798510900003e-06, + "loss": 0.7623713, + "num_input_tokens_seen": 7198720, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 3.70507812, + "step": 339, + "time_per_iteration": 3.0992257595062256 + }, + { + "auxiliary_loss_clip": 0.01665609, + "auxiliary_loss_mlp": 0.01534617, + "balance_loss_clip": 1.31251955, + "balance_loss_mlp": 1.16611731, + "epoch": 0.04088258281729093, + "flos": 25887706701120.0, + "grad_norm": 2.494264235299187, + "language_loss": 0.84214902, + "learning_rate": 3.998771362241207e-06, + "loss": 0.87415123, + "num_input_tokens_seen": 7219125, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 3.6796875, + "step": 340, + "time_per_iteration": 2.965557098388672 + }, + { + "auxiliary_loss_clip": 0.0166472, + "auxiliary_loss_mlp": 0.01504909, + "balance_loss_clip": 1.31343961, + "balance_loss_mlp": 1.12897038, + "epoch": 0.04100282570793002, + "flos": 19791756128160.0, + "grad_norm": 1.8385788930273577, + "language_loss": 0.87786174, + "learning_rate": 3.998743910370385e-06, + "loss": 0.90955806, + "num_input_tokens_seen": 7237985, + "router_z_loss_clip": 3.51171875, + "router_z_loss_mlp": 3.75390625, + "step": 341, + "time_per_iteration": 2.9961154460906982 + }, + { + "auxiliary_loss_clip": 0.01677063, + "auxiliary_loss_mlp": 0.01515538, + "balance_loss_clip": 1.32568741, + "balance_loss_mlp": 1.1418879, + "epoch": 0.04112306859856911, + "flos": 22567366565280.0, + "grad_norm": 2.5865757029963943, + "language_loss": 0.7358849, + "learning_rate": 3.998716155291702e-06, + "loss": 0.76781082, + "num_input_tokens_seen": 7255825, + "router_z_loss_clip": 3.50976562, + "router_z_loss_mlp": 3.734375, + "step": 342, + "time_per_iteration": 3.05784010887146 + }, + { + "auxiliary_loss_clip": 0.01671112, + "auxiliary_loss_mlp": 0.01491542, + "balance_loss_clip": 1.3184154, + "balance_loss_mlp": 1.10034442, + "epoch": 0.0412433114892082, + "flos": 25042696845120.0, + "grad_norm": 1.8674059909438228, + "language_loss": 0.90671706, + "learning_rate": 3.998688097009366e-06, + "loss": 0.93834352, + "num_input_tokens_seen": 7276590, + "router_z_loss_clip": 3.52539062, + "router_z_loss_mlp": 3.91015625, + "step": 343, + "time_per_iteration": 3.151297092437744 + }, + { + "auxiliary_loss_clip": 0.01663482, + "auxiliary_loss_mlp": 0.0151352, + "balance_loss_clip": 1.31071079, + "balance_loss_mlp": 1.12384832, + "epoch": 0.04136355437984729, + "flos": 25193690307360.0, + "grad_norm": 2.1295909689368324, + "language_loss": 0.80032599, + "learning_rate": 3.998659735527636e-06, + "loss": 0.83209598, + "num_input_tokens_seen": 7295680, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 3.89257812, + "step": 344, + "time_per_iteration": 3.8722872734069824 + }, + { + "auxiliary_loss_clip": 0.01656046, + "auxiliary_loss_mlp": 0.01500259, + "balance_loss_clip": 1.30337691, + "balance_loss_mlp": 1.11802673, + "epoch": 0.04148379727048638, + "flos": 22969029787200.0, + "grad_norm": 1.7927881874569007, + "language_loss": 0.77984911, + "learning_rate": 3.998631070850813e-06, + "loss": 0.81141216, + "num_input_tokens_seen": 7316300, + "router_z_loss_clip": 3.52734375, + "router_z_loss_mlp": 3.8203125, + "step": 345, + "time_per_iteration": 4.718621730804443 + }, + { + "auxiliary_loss_clip": 0.01668263, + "auxiliary_loss_mlp": 0.01500736, + "balance_loss_clip": 1.31631494, + "balance_loss_mlp": 1.11564255, + "epoch": 0.041604040161125476, + "flos": 14065191548640.0, + "grad_norm": 2.492825799986144, + "language_loss": 0.83808333, + "learning_rate": 3.9986021029832455e-06, + "loss": 0.86977333, + "num_input_tokens_seen": 7333615, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 3.84960938, + "step": 346, + "time_per_iteration": 3.8736155033111572 + }, + { + "auxiliary_loss_clip": 0.01664897, + "auxiliary_loss_mlp": 0.01520783, + "balance_loss_clip": 1.31032121, + "balance_loss_mlp": 1.14160252, + "epoch": 0.041724283051764566, + "flos": 12093400621440.0, + "grad_norm": 3.224687169671678, + "language_loss": 0.91668183, + "learning_rate": 3.9985728319293285e-06, + "loss": 0.94853866, + "num_input_tokens_seen": 7347590, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 3.78710938, + "step": 347, + "time_per_iteration": 3.0955135822296143 + }, + { + "auxiliary_loss_clip": 0.01659238, + "auxiliary_loss_mlp": 0.0154152, + "balance_loss_clip": 1.30527294, + "balance_loss_mlp": 1.18103075, + "epoch": 0.041844525942403656, + "flos": 12386929566240.0, + "grad_norm": 2.3518774705657837, + "language_loss": 0.85228074, + "learning_rate": 3.998543257693501e-06, + "loss": 0.88428831, + "num_input_tokens_seen": 7364345, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 3.60742188, + "step": 348, + "time_per_iteration": 3.0621731281280518 + }, + { + "auxiliary_loss_clip": 0.01664372, + "auxiliary_loss_mlp": 0.01495031, + "balance_loss_clip": 1.31280553, + "balance_loss_mlp": 1.10936475, + "epoch": 0.041964768833042745, + "flos": 23771635596000.0, + "grad_norm": 2.2345435053831015, + "language_loss": 0.87942529, + "learning_rate": 3.998513380280251e-06, + "loss": 0.91101927, + "num_input_tokens_seen": 7384625, + "router_z_loss_clip": 3.51171875, + "router_z_loss_mlp": 3.85351562, + "step": 349, + "time_per_iteration": 2.979677438735962 + }, + { + "auxiliary_loss_clip": 0.01662952, + "auxiliary_loss_mlp": 0.01544564, + "balance_loss_clip": 1.30921614, + "balance_loss_mlp": 1.16118646, + "epoch": 0.042085011723681835, + "flos": 11876980354560.0, + "grad_norm": 4.256784549172828, + "language_loss": 0.95014846, + "learning_rate": 3.99848319969411e-06, + "loss": 0.98222363, + "num_input_tokens_seen": 7402225, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 3.83203125, + "step": 350, + "time_per_iteration": 3.0005455017089844 + }, + { + "auxiliary_loss_clip": 0.01667922, + "auxiliary_loss_mlp": 0.01543566, + "balance_loss_clip": 1.31309867, + "balance_loss_mlp": 1.1552304, + "epoch": 0.042205254614320925, + "flos": 16875885898080.0, + "grad_norm": 2.400016824208345, + "language_loss": 0.79684663, + "learning_rate": 3.9984527159396564e-06, + "loss": 0.82896149, + "num_input_tokens_seen": 7420865, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 3.8828125, + "step": 351, + "time_per_iteration": 3.044827699661255 + }, + { + "auxiliary_loss_clip": 0.01658819, + "auxiliary_loss_mlp": 0.01520974, + "balance_loss_clip": 1.30517554, + "balance_loss_mlp": 1.14465415, + "epoch": 0.04232549750496002, + "flos": 25120829583360.0, + "grad_norm": 2.420877898190216, + "language_loss": 0.84732652, + "learning_rate": 3.9984219290215154e-06, + "loss": 0.8791244, + "num_input_tokens_seen": 7441040, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 3.76367188, + "step": 352, + "time_per_iteration": 3.010295867919922 + }, + { + "auxiliary_loss_clip": 0.01663224, + "auxiliary_loss_mlp": 0.01502858, + "balance_loss_clip": 1.31072474, + "balance_loss_mlp": 1.12482131, + "epoch": 0.04244574039559911, + "flos": 26726989404960.0, + "grad_norm": 3.6503646993468366, + "language_loss": 0.89191842, + "learning_rate": 3.998390838944356e-06, + "loss": 0.92357922, + "num_input_tokens_seen": 7462545, + "router_z_loss_clip": 3.5234375, + "router_z_loss_mlp": 3.77539062, + "step": 353, + "time_per_iteration": 3.1273610591888428 + }, + { + "auxiliary_loss_clip": 0.01665169, + "auxiliary_loss_mlp": 0.01524224, + "balance_loss_clip": 1.31228852, + "balance_loss_mlp": 1.1553427, + "epoch": 0.0425659832862382, + "flos": 20925364268160.0, + "grad_norm": 3.026513121013755, + "language_loss": 0.90236294, + "learning_rate": 3.998359445712895e-06, + "loss": 0.93425685, + "num_input_tokens_seen": 7481650, + "router_z_loss_clip": 3.52539062, + "router_z_loss_mlp": 3.68554688, + "step": 354, + "time_per_iteration": 2.9438445568084717 + }, + { + "auxiliary_loss_clip": 0.01656389, + "auxiliary_loss_mlp": 0.01500401, + "balance_loss_clip": 1.30205679, + "balance_loss_mlp": 1.11359024, + "epoch": 0.04268622617687729, + "flos": 23333371335360.0, + "grad_norm": 2.3004387611577166, + "language_loss": 0.81382811, + "learning_rate": 3.9983277493318955e-06, + "loss": 0.84539598, + "num_input_tokens_seen": 7500945, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 3.86328125, + "step": 355, + "time_per_iteration": 2.9848992824554443 + }, + { + "auxiliary_loss_clip": 0.01655859, + "auxiliary_loss_mlp": 0.01525766, + "balance_loss_clip": 1.30281901, + "balance_loss_mlp": 1.15402389, + "epoch": 0.04280646906751638, + "flos": 25996371972480.0, + "grad_norm": 1.9285349344135254, + "language_loss": 0.81253088, + "learning_rate": 3.998295749806165e-06, + "loss": 0.84434712, + "num_input_tokens_seen": 7522170, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 3.71484375, + "step": 356, + "time_per_iteration": 2.964641571044922 + }, + { + "auxiliary_loss_clip": 0.0166607, + "auxiliary_loss_mlp": 0.01525047, + "balance_loss_clip": 1.31340444, + "balance_loss_mlp": 1.15540278, + "epoch": 0.04292671195815547, + "flos": 26909273963520.0, + "grad_norm": 2.235030561042832, + "language_loss": 0.83723068, + "learning_rate": 3.998263447140558e-06, + "loss": 0.86914182, + "num_input_tokens_seen": 7542370, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 3.69335938, + "step": 357, + "time_per_iteration": 2.9356467723846436 + }, + { + "auxiliary_loss_clip": 0.01654832, + "auxiliary_loss_mlp": 0.01513248, + "balance_loss_clip": 1.3012867, + "balance_loss_mlp": 1.13788188, + "epoch": 0.04304695484879457, + "flos": 39460434284160.0, + "grad_norm": 2.0753051058914056, + "language_loss": 0.82054842, + "learning_rate": 3.998230841339976e-06, + "loss": 0.85222924, + "num_input_tokens_seen": 7564380, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 3.75390625, + "step": 358, + "time_per_iteration": 3.087510585784912 + }, + { + "auxiliary_loss_clip": 0.01664415, + "auxiliary_loss_mlp": 0.0153451, + "balance_loss_clip": 1.31051755, + "balance_loss_mlp": 1.15437543, + "epoch": 0.04316719773943366, + "flos": 19648348297920.0, + "grad_norm": 2.4451660867045373, + "language_loss": 0.8542282, + "learning_rate": 3.998197932409363e-06, + "loss": 0.88621742, + "num_input_tokens_seen": 7582390, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 3.80078125, + "step": 359, + "time_per_iteration": 2.9704325199127197 + }, + { + "auxiliary_loss_clip": 0.01666745, + "auxiliary_loss_mlp": 0.01527478, + "balance_loss_clip": 1.31420708, + "balance_loss_mlp": 1.15993166, + "epoch": 0.04328744063007275, + "flos": 22454339555520.0, + "grad_norm": 3.1308717957270327, + "language_loss": 0.86489147, + "learning_rate": 3.9981647203537125e-06, + "loss": 0.89683372, + "num_input_tokens_seen": 7599890, + "router_z_loss_clip": 3.52539062, + "router_z_loss_mlp": 3.671875, + "step": 360, + "time_per_iteration": 2.9402294158935547 + }, + { + "auxiliary_loss_clip": 0.01662446, + "auxiliary_loss_mlp": 0.01507088, + "balance_loss_clip": 1.30996454, + "balance_loss_mlp": 1.12142181, + "epoch": 0.04340768352071184, + "flos": 21284888940000.0, + "grad_norm": 5.507158046437318, + "language_loss": 0.96274781, + "learning_rate": 3.998131205178063e-06, + "loss": 0.99444312, + "num_input_tokens_seen": 7618360, + "router_z_loss_clip": 3.5234375, + "router_z_loss_mlp": 3.85546875, + "step": 361, + "time_per_iteration": 3.024188995361328 + }, + { + "auxiliary_loss_clip": 0.01657044, + "auxiliary_loss_mlp": 0.01511566, + "balance_loss_clip": 1.30456936, + "balance_loss_mlp": 1.13772583, + "epoch": 0.04352792641135093, + "flos": 11585347817760.0, + "grad_norm": 8.66865417314428, + "language_loss": 0.77506208, + "learning_rate": 3.998097386887498e-06, + "loss": 0.80674827, + "num_input_tokens_seen": 7635435, + "router_z_loss_clip": 3.5234375, + "router_z_loss_mlp": 3.73632812, + "step": 362, + "time_per_iteration": 3.0667498111724854 + }, + { + "auxiliary_loss_clip": 0.0165814, + "auxiliary_loss_mlp": 0.01522824, + "balance_loss_clip": 1.30608582, + "balance_loss_mlp": 1.15031838, + "epoch": 0.04364816930199002, + "flos": 23625838291680.0, + "grad_norm": 1.8134562572740038, + "language_loss": 0.85184336, + "learning_rate": 3.998063265487148e-06, + "loss": 0.88365293, + "num_input_tokens_seen": 7656485, + "router_z_loss_clip": 3.52539062, + "router_z_loss_mlp": 3.72460938, + "step": 363, + "time_per_iteration": 2.9597535133361816 + }, + { + "auxiliary_loss_clip": 0.01658805, + "auxiliary_loss_mlp": 0.01525058, + "balance_loss_clip": 1.30634499, + "balance_loss_mlp": 1.15827489, + "epoch": 0.043768412192629114, + "flos": 14431543289280.0, + "grad_norm": 3.7856413846877066, + "language_loss": 0.80826318, + "learning_rate": 3.99802884098219e-06, + "loss": 0.84010184, + "num_input_tokens_seen": 7674595, + "router_z_loss_clip": 3.52734375, + "router_z_loss_mlp": 3.6640625, + "step": 364, + "time_per_iteration": 2.9198427200317383 + }, + { + "auxiliary_loss_clip": 0.01657404, + "auxiliary_loss_mlp": 0.0152247, + "balance_loss_clip": 1.30443549, + "balance_loss_mlp": 1.15873873, + "epoch": 0.043888655083268203, + "flos": 26471313128160.0, + "grad_norm": 2.305611623745452, + "language_loss": 0.82443827, + "learning_rate": 3.997994113377845e-06, + "loss": 0.85623699, + "num_input_tokens_seen": 7693495, + "router_z_loss_clip": 3.52734375, + "router_z_loss_mlp": 3.63671875, + "step": 365, + "time_per_iteration": 3.0079345703125 + }, + { + "auxiliary_loss_clip": 0.01657553, + "auxiliary_loss_mlp": 0.01516292, + "balance_loss_clip": 1.30425572, + "balance_loss_mlp": 1.14912724, + "epoch": 0.04400889797390729, + "flos": 27237735108000.0, + "grad_norm": 2.3265838588890633, + "language_loss": 0.83379054, + "learning_rate": 3.9979590826793815e-06, + "loss": 0.86552894, + "num_input_tokens_seen": 7714685, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 3.66796875, + "step": 366, + "time_per_iteration": 3.036660671234131 + }, + { + "auxiliary_loss_clip": 0.01659898, + "auxiliary_loss_mlp": 0.01515539, + "balance_loss_clip": 1.3061347, + "balance_loss_mlp": 1.14799249, + "epoch": 0.04412914086454638, + "flos": 20121544758240.0, + "grad_norm": 2.265627029168691, + "language_loss": 0.81253809, + "learning_rate": 3.997923748892113e-06, + "loss": 0.84429246, + "num_input_tokens_seen": 7734005, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 3.67382812, + "step": 367, + "time_per_iteration": 2.953183889389038 + }, + { + "auxiliary_loss_clip": 0.01659665, + "auxiliary_loss_mlp": 0.01535582, + "balance_loss_clip": 1.30641675, + "balance_loss_mlp": 1.1680361, + "epoch": 0.04424938375518547, + "flos": 22607001856800.0, + "grad_norm": 1.6929783452343328, + "language_loss": 0.88750148, + "learning_rate": 3.9978881120214015e-06, + "loss": 0.91945398, + "num_input_tokens_seen": 7755525, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 3.67382812, + "step": 368, + "time_per_iteration": 2.9613265991210938 + }, + { + "auxiliary_loss_clip": 0.01652498, + "auxiliary_loss_mlp": 0.01519613, + "balance_loss_clip": 1.29700685, + "balance_loss_mlp": 1.14024091, + "epoch": 0.04436962664582456, + "flos": 24134573802240.0, + "grad_norm": 2.6609239340949267, + "language_loss": 0.79796886, + "learning_rate": 3.997852172072652e-06, + "loss": 0.82968998, + "num_input_tokens_seen": 7776740, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 3.79101562, + "step": 369, + "time_per_iteration": 2.946967840194702 + }, + { + "auxiliary_loss_clip": 0.01653, + "auxiliary_loss_mlp": 0.01502446, + "balance_loss_clip": 1.29819131, + "balance_loss_mlp": 1.13394666, + "epoch": 0.04448986953646366, + "flos": 18224852316480.0, + "grad_norm": 3.4091444162740925, + "language_loss": 0.8956309, + "learning_rate": 3.9978159290513155e-06, + "loss": 0.92718542, + "num_input_tokens_seen": 7794820, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 3.6796875, + "step": 370, + "time_per_iteration": 2.9732210636138916 + }, + { + "auxiliary_loss_clip": 0.016491, + "auxiliary_loss_mlp": 0.01525155, + "balance_loss_clip": 1.29400516, + "balance_loss_mlp": 1.15608287, + "epoch": 0.04461011242710275, + "flos": 30120569546400.0, + "grad_norm": 2.298747943359223, + "language_loss": 0.80255687, + "learning_rate": 3.997779382962892e-06, + "loss": 0.83429945, + "num_input_tokens_seen": 7817705, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 3.68945312, + "step": 371, + "time_per_iteration": 3.8362977504730225 + }, + { + "auxiliary_loss_clip": 0.01651993, + "auxiliary_loss_mlp": 0.01531712, + "balance_loss_clip": 1.29646635, + "balance_loss_mlp": 1.16511953, + "epoch": 0.04473035531774184, + "flos": 29755128081600.0, + "grad_norm": 3.4660934949186943, + "language_loss": 0.7384907, + "learning_rate": 3.997742533812924e-06, + "loss": 0.77032775, + "num_input_tokens_seen": 7840970, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 3.66601562, + "step": 372, + "time_per_iteration": 2.945331573486328 + }, + { + "auxiliary_loss_clip": 0.01648802, + "auxiliary_loss_mlp": 0.01533562, + "balance_loss_clip": 1.29325044, + "balance_loss_mlp": 1.17173827, + "epoch": 0.04485059820838093, + "flos": 13153237761600.0, + "grad_norm": 3.3985330255221498, + "language_loss": 0.92636383, + "learning_rate": 3.997705381607001e-06, + "loss": 0.9581874, + "num_input_tokens_seen": 7857785, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 3.61328125, + "step": 373, + "time_per_iteration": 4.570287704467773 + }, + { + "auxiliary_loss_clip": 0.0166851, + "auxiliary_loss_mlp": 0.01710533, + "balance_loss_clip": 1.32144284, + "balance_loss_mlp": 1.41298676, + "epoch": 0.04497084109902002, + "flos": 68100728664960.0, + "grad_norm": 1.0956801570252674, + "language_loss": 0.60202444, + "learning_rate": 3.997667926350761e-06, + "loss": 0.63581485, + "num_input_tokens_seen": 7916115, + "router_z_loss_clip": 3.46875, + "router_z_loss_mlp": 2.96875, + "step": 374, + "time_per_iteration": 4.189523458480835 + }, + { + "auxiliary_loss_clip": 0.01661736, + "auxiliary_loss_mlp": 0.01551777, + "balance_loss_clip": 1.31445706, + "balance_loss_mlp": 1.23439407, + "epoch": 0.04509108398965911, + "flos": 64348913409120.0, + "grad_norm": 0.934030419322081, + "language_loss": 0.57738554, + "learning_rate": 3.997630168049886e-06, + "loss": 0.60952067, + "num_input_tokens_seen": 7974480, + "router_z_loss_clip": 3.46875, + "router_z_loss_mlp": 3.171875, + "step": 375, + "time_per_iteration": 3.379185914993286 + }, + { + "auxiliary_loss_clip": 0.01646045, + "auxiliary_loss_mlp": 0.01513173, + "balance_loss_clip": 1.28812099, + "balance_loss_mlp": 1.13074946, + "epoch": 0.045211326880298205, + "flos": 22273192841760.0, + "grad_norm": 2.547619474806646, + "language_loss": 0.77963471, + "learning_rate": 3.997592106710101e-06, + "loss": 0.8112269, + "num_input_tokens_seen": 7993940, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 3.8203125, + "step": 376, + "time_per_iteration": 2.9379868507385254 + }, + { + "auxiliary_loss_clip": 0.01644006, + "auxiliary_loss_mlp": 0.01498939, + "balance_loss_clip": 1.28753293, + "balance_loss_mlp": 1.11765981, + "epoch": 0.045331569770937295, + "flos": 32162111088480.0, + "grad_norm": 3.469171382125063, + "language_loss": 0.65910602, + "learning_rate": 3.997553742337182e-06, + "loss": 0.69053543, + "num_input_tokens_seen": 8013365, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 3.80859375, + "step": 377, + "time_per_iteration": 3.083859443664551 + }, + { + "auxiliary_loss_clip": 0.01644856, + "auxiliary_loss_mlp": 0.01491768, + "balance_loss_clip": 1.28741527, + "balance_loss_mlp": 1.09713745, + "epoch": 0.045451812661576385, + "flos": 22165627487040.0, + "grad_norm": 2.250282969865154, + "language_loss": 0.91553211, + "learning_rate": 3.997515074936949e-06, + "loss": 0.94689834, + "num_input_tokens_seen": 8034240, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 3.94726562, + "step": 378, + "time_per_iteration": 2.9125497341156006 + }, + { + "auxiliary_loss_clip": 0.01648898, + "auxiliary_loss_mlp": 0.01568443, + "balance_loss_clip": 1.29018617, + "balance_loss_mlp": 1.16179657, + "epoch": 0.045572055552215475, + "flos": 16583874079680.0, + "grad_norm": 2.3604785304509424, + "language_loss": 0.87356913, + "learning_rate": 3.997476104515268e-06, + "loss": 0.90574253, + "num_input_tokens_seen": 8052430, + "router_z_loss_clip": 3.58789062, + "router_z_loss_mlp": 4.0703125, + "step": 379, + "time_per_iteration": 2.9542176723480225 + }, + { + "auxiliary_loss_clip": 0.01646478, + "auxiliary_loss_mlp": 0.01542555, + "balance_loss_clip": 1.28983951, + "balance_loss_mlp": 1.12904215, + "epoch": 0.045692298442854565, + "flos": 17605327557600.0, + "grad_norm": 2.578576276882306, + "language_loss": 0.77786255, + "learning_rate": 3.9974368310780485e-06, + "loss": 0.80975294, + "num_input_tokens_seen": 8069605, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 4.13867188, + "step": 380, + "time_per_iteration": 2.9856998920440674 + }, + { + "auxiliary_loss_clip": 0.01646228, + "auxiliary_loss_mlp": 0.01543793, + "balance_loss_clip": 1.287498, + "balance_loss_mlp": 1.13562083, + "epoch": 0.045812541333493655, + "flos": 26763552515520.0, + "grad_norm": 12.386972054087572, + "language_loss": 0.74189889, + "learning_rate": 3.997397254631251e-06, + "loss": 0.77379906, + "num_input_tokens_seen": 8090225, + "router_z_loss_clip": 3.58789062, + "router_z_loss_mlp": 4.08398438, + "step": 381, + "time_per_iteration": 2.971203327178955 + }, + { + "auxiliary_loss_clip": 0.01644608, + "auxiliary_loss_mlp": 0.01670082, + "balance_loss_clip": 1.29546475, + "balance_loss_mlp": 1.20926666, + "epoch": 0.04593278422413275, + "flos": 60256462069440.0, + "grad_norm": 0.9809635921622941, + "language_loss": 0.60056674, + "learning_rate": 3.997357375180878e-06, + "loss": 0.63371366, + "num_input_tokens_seen": 8154505, + "router_z_loss_clip": 3.4921875, + "router_z_loss_mlp": 4.59375, + "step": 382, + "time_per_iteration": 3.4928829669952393 + }, + { + "auxiliary_loss_clip": 0.01639374, + "auxiliary_loss_mlp": 0.01531721, + "balance_loss_clip": 1.28125548, + "balance_loss_mlp": 1.11630082, + "epoch": 0.04605302711477184, + "flos": 21801437651520.0, + "grad_norm": 3.7828585171583518, + "language_loss": 0.75506902, + "learning_rate": 3.997317192732979e-06, + "loss": 0.78678, + "num_input_tokens_seen": 8173285, + "router_z_loss_clip": 3.58203125, + "router_z_loss_mlp": 4.16015625, + "step": 383, + "time_per_iteration": 2.9946744441986084 + }, + { + "auxiliary_loss_clip": 0.01635288, + "auxiliary_loss_mlp": 0.01517167, + "balance_loss_clip": 1.27630424, + "balance_loss_mlp": 1.1063236, + "epoch": 0.04617327000541093, + "flos": 19461512360160.0, + "grad_norm": 2.5605432433659048, + "language_loss": 0.82719147, + "learning_rate": 3.99727670729365e-06, + "loss": 0.85871607, + "num_input_tokens_seen": 8191845, + "router_z_loss_clip": 3.58984375, + "router_z_loss_mlp": 4.11132812, + "step": 384, + "time_per_iteration": 2.937809467315674 + }, + { + "auxiliary_loss_clip": 0.01642528, + "auxiliary_loss_mlp": 0.0152098, + "balance_loss_clip": 1.283692, + "balance_loss_mlp": 1.10937381, + "epoch": 0.04629351289605002, + "flos": 25413865462080.0, + "grad_norm": 3.576690644420031, + "language_loss": 0.7798872, + "learning_rate": 3.997235918869033e-06, + "loss": 0.81152225, + "num_input_tokens_seen": 8212880, + "router_z_loss_clip": 3.5859375, + "router_z_loss_mlp": 4.12109375, + "step": 385, + "time_per_iteration": 2.995666027069092 + }, + { + "auxiliary_loss_clip": 0.01638299, + "auxiliary_loss_mlp": 0.01527803, + "balance_loss_clip": 1.27891922, + "balance_loss_mlp": 1.1184864, + "epoch": 0.04641375578668911, + "flos": 20560377941280.0, + "grad_norm": 5.245441199022422, + "language_loss": 0.82756138, + "learning_rate": 3.997194827465315e-06, + "loss": 0.85922241, + "num_input_tokens_seen": 8231475, + "router_z_loss_clip": 3.59375, + "router_z_loss_mlp": 4.09375, + "step": 386, + "time_per_iteration": 2.9508790969848633 + }, + { + "auxiliary_loss_clip": 0.0163687, + "auxiliary_loss_mlp": 0.01524829, + "balance_loss_clip": 1.27667117, + "balance_loss_mlp": 1.12142491, + "epoch": 0.0465339986773282, + "flos": 13190369794560.0, + "grad_norm": 2.9437628457266496, + "language_loss": 0.91594869, + "learning_rate": 3.997153433088728e-06, + "loss": 0.94756567, + "num_input_tokens_seen": 8248600, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 4.03515625, + "step": 387, + "time_per_iteration": 3.069322109222412 + }, + { + "auxiliary_loss_clip": 0.01644814, + "auxiliary_loss_mlp": 0.01507491, + "balance_loss_clip": 1.28436756, + "balance_loss_mlp": 1.11476827, + "epoch": 0.0466542415679673, + "flos": 25558904203200.0, + "grad_norm": 2.5185279826742493, + "language_loss": 0.80918932, + "learning_rate": 3.997111735745554e-06, + "loss": 0.84071237, + "num_input_tokens_seen": 8271570, + "router_z_loss_clip": 3.60351562, + "router_z_loss_mlp": 3.92773438, + "step": 388, + "time_per_iteration": 3.072360038757324 + }, + { + "auxiliary_loss_clip": 0.01646547, + "auxiliary_loss_mlp": 0.01511002, + "balance_loss_clip": 1.2867403, + "balance_loss_mlp": 1.12056816, + "epoch": 0.04677448445860639, + "flos": 22238943348960.0, + "grad_norm": 2.50809928445715, + "language_loss": 0.8282178, + "learning_rate": 3.997069735442118e-06, + "loss": 0.85979331, + "num_input_tokens_seen": 8291265, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 3.90234375, + "step": 389, + "time_per_iteration": 3.114248752593994 + }, + { + "auxiliary_loss_clip": 0.01632068, + "auxiliary_loss_mlp": 0.01519352, + "balance_loss_clip": 1.27196074, + "balance_loss_mlp": 1.13120651, + "epoch": 0.04689472734924548, + "flos": 28150371601920.0, + "grad_norm": 1.6544835740062571, + "language_loss": 0.80409515, + "learning_rate": 3.997027432184792e-06, + "loss": 0.83560932, + "num_input_tokens_seen": 8315925, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 3.88085938, + "step": 390, + "time_per_iteration": 3.200808048248291 + }, + { + "auxiliary_loss_clip": 0.01640528, + "auxiliary_loss_mlp": 0.01518255, + "balance_loss_clip": 1.28086936, + "balance_loss_mlp": 1.12858367, + "epoch": 0.04701497023988457, + "flos": 23151200561280.0, + "grad_norm": 2.58838247498375, + "language_loss": 0.89499211, + "learning_rate": 3.99698482597999e-06, + "loss": 0.92657995, + "num_input_tokens_seen": 8333605, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 3.89648438, + "step": 391, + "time_per_iteration": 2.9600987434387207 + }, + { + "auxiliary_loss_clip": 0.01672686, + "auxiliary_loss_mlp": 0.01526085, + "balance_loss_clip": 1.31818759, + "balance_loss_mlp": 1.10494232, + "epoch": 0.04713521313052366, + "flos": 64834967515680.0, + "grad_norm": 0.8764397697267703, + "language_loss": 0.63850957, + "learning_rate": 3.99694191683418e-06, + "loss": 0.6704973, + "num_input_tokens_seen": 8394405, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 4.21875, + "step": 392, + "time_per_iteration": 3.493544101715088 + }, + { + "auxiliary_loss_clip": 0.01638609, + "auxiliary_loss_mlp": 0.01562784, + "balance_loss_clip": 1.27683127, + "balance_loss_mlp": 1.18703651, + "epoch": 0.047255456021162746, + "flos": 18773640328320.0, + "grad_norm": 2.881754764855062, + "language_loss": 0.81800854, + "learning_rate": 3.996898704753867e-06, + "loss": 0.85002249, + "num_input_tokens_seen": 8412355, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 3.75585938, + "step": 393, + "time_per_iteration": 2.9423792362213135 + }, + { + "auxiliary_loss_clip": 0.01641432, + "auxiliary_loss_mlp": 0.01510185, + "balance_loss_clip": 1.27753091, + "balance_loss_mlp": 1.13672662, + "epoch": 0.04737569891180184, + "flos": 22055976083520.0, + "grad_norm": 2.202248555088427, + "language_loss": 0.8772344, + "learning_rate": 3.996855189745609e-06, + "loss": 0.90875053, + "num_input_tokens_seen": 8431620, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 3.73242188, + "step": 394, + "time_per_iteration": 2.9568912982940674 + }, + { + "auxiliary_loss_clip": 0.01634445, + "auxiliary_loss_mlp": 0.01514905, + "balance_loss_clip": 1.27247274, + "balance_loss_mlp": 1.14220881, + "epoch": 0.04749594180244093, + "flos": 29059822130400.0, + "grad_norm": 2.0133531485155056, + "language_loss": 0.92712963, + "learning_rate": 3.996811371816007e-06, + "loss": 0.95862305, + "num_input_tokens_seen": 8454045, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 3.72265625, + "step": 395, + "time_per_iteration": 3.0177741050720215 + }, + { + "auxiliary_loss_clip": 0.01641847, + "auxiliary_loss_mlp": 0.01557131, + "balance_loss_clip": 1.27793944, + "balance_loss_mlp": 1.18882239, + "epoch": 0.04761618469308002, + "flos": 35114961638880.0, + "grad_norm": 2.524132024842604, + "language_loss": 0.78147435, + "learning_rate": 3.996767250971707e-06, + "loss": 0.81346411, + "num_input_tokens_seen": 8476785, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 3.68164062, + "step": 396, + "time_per_iteration": 3.0378947257995605 + }, + { + "auxiliary_loss_clip": 0.01644308, + "auxiliary_loss_mlp": 0.01562046, + "balance_loss_clip": 1.2820704, + "balance_loss_mlp": 1.19850492, + "epoch": 0.04773642758371911, + "flos": 25633130340960.0, + "grad_norm": 1.9723294290262643, + "language_loss": 0.87007463, + "learning_rate": 3.996722827219403e-06, + "loss": 0.90213817, + "num_input_tokens_seen": 8498400, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 3.63476562, + "step": 397, + "time_per_iteration": 2.9411182403564453 + }, + { + "auxiliary_loss_clip": 0.01648437, + "auxiliary_loss_mlp": 0.01541647, + "balance_loss_clip": 1.28516448, + "balance_loss_mlp": 1.18134916, + "epoch": 0.0478566704743582, + "flos": 20633617946880.0, + "grad_norm": 4.508909230533819, + "language_loss": 0.82664794, + "learning_rate": 3.996678100565833e-06, + "loss": 0.85854876, + "num_input_tokens_seen": 8517455, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 3.6015625, + "step": 398, + "time_per_iteration": 2.978086471557617 + }, + { + "auxiliary_loss_clip": 0.01641768, + "auxiliary_loss_mlp": 0.01517739, + "balance_loss_clip": 1.27683914, + "balance_loss_mlp": 1.15076542, + "epoch": 0.04797691336499729, + "flos": 18837284509440.0, + "grad_norm": 2.465218473656089, + "language_loss": 0.8855682, + "learning_rate": 3.996633071017783e-06, + "loss": 0.91716325, + "num_input_tokens_seen": 8534085, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 3.66796875, + "step": 399, + "time_per_iteration": 3.777329683303833 + }, + { + "auxiliary_loss_clip": 0.01648935, + "auxiliary_loss_mlp": 0.01567609, + "balance_loss_clip": 1.28400922, + "balance_loss_mlp": 1.21684802, + "epoch": 0.04809715625563638, + "flos": 21101466536640.0, + "grad_norm": 2.692005254651529, + "language_loss": 0.81759751, + "learning_rate": 3.996587738582084e-06, + "loss": 0.84976292, + "num_input_tokens_seen": 8550885, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 3.50976562, + "step": 400, + "time_per_iteration": 3.993039846420288 + }, + { + "auxiliary_loss_clip": 0.01636588, + "auxiliary_loss_mlp": 0.01529822, + "balance_loss_clip": 1.27234817, + "balance_loss_mlp": 1.17085898, + "epoch": 0.04821739914627548, + "flos": 23807933209440.0, + "grad_norm": 2.769425276371784, + "language_loss": 0.86378425, + "learning_rate": 3.9965421032656115e-06, + "loss": 0.89544839, + "num_input_tokens_seen": 8570815, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 3.58789062, + "step": 401, + "time_per_iteration": 4.728279113769531 + }, + { + "auxiliary_loss_clip": 0.01639005, + "auxiliary_loss_mlp": 0.01539974, + "balance_loss_clip": 1.27424932, + "balance_loss_mlp": 1.18043864, + "epoch": 0.04833764203691457, + "flos": 22202797448160.0, + "grad_norm": 2.9272632306945408, + "language_loss": 0.94577259, + "learning_rate": 3.99649616507529e-06, + "loss": 0.97756243, + "num_input_tokens_seen": 8589910, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 3.59179688, + "step": 402, + "time_per_iteration": 3.016589641571045 + }, + { + "auxiliary_loss_clip": 0.0173198, + "auxiliary_loss_mlp": 0.01436256, + "balance_loss_clip": 1.37649238, + "balance_loss_mlp": 1.06012726, + "epoch": 0.04845788492755366, + "flos": 65910734847360.0, + "grad_norm": 0.9068621940605749, + "language_loss": 0.63113147, + "learning_rate": 3.996449924018088e-06, + "loss": 0.66281384, + "num_input_tokens_seen": 8650370, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 3.7578125, + "step": 403, + "time_per_iteration": 3.3920931816101074 + }, + { + "auxiliary_loss_clip": 0.01635598, + "auxiliary_loss_mlp": 0.01555123, + "balance_loss_clip": 1.2705282, + "balance_loss_mlp": 1.19291747, + "epoch": 0.04857812781819275, + "flos": 19283324042880.0, + "grad_norm": 2.042539073939419, + "language_loss": 0.7968424, + "learning_rate": 3.99640338010102e-06, + "loss": 0.82874966, + "num_input_tokens_seen": 8669475, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 3.61914062, + "step": 404, + "time_per_iteration": 2.969165325164795 + }, + { + "auxiliary_loss_clip": 0.01635537, + "auxiliary_loss_mlp": 0.01524368, + "balance_loss_clip": 1.2713393, + "balance_loss_mlp": 1.16254354, + "epoch": 0.04869837070883184, + "flos": 24064747331040.0, + "grad_norm": 2.0959387025209666, + "language_loss": 0.78997964, + "learning_rate": 3.996356533331146e-06, + "loss": 0.82157862, + "num_input_tokens_seen": 8691345, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 3.6171875, + "step": 405, + "time_per_iteration": 3.0116515159606934 + }, + { + "auxiliary_loss_clip": 0.01639267, + "auxiliary_loss_mlp": 0.01530497, + "balance_loss_clip": 1.27444458, + "balance_loss_mlp": 1.16562104, + "epoch": 0.04881861359947093, + "flos": 25189252712640.0, + "grad_norm": 3.3014429088656962, + "language_loss": 0.61724198, + "learning_rate": 3.996309383715573e-06, + "loss": 0.64893961, + "num_input_tokens_seen": 8710125, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 3.65039062, + "step": 406, + "time_per_iteration": 3.0772318840026855 + }, + { + "auxiliary_loss_clip": 0.01633481, + "auxiliary_loss_mlp": 0.01518778, + "balance_loss_clip": 1.26903594, + "balance_loss_mlp": 1.15352046, + "epoch": 0.048938856490110025, + "flos": 16364798841600.0, + "grad_norm": 4.014122368649661, + "language_loss": 0.73766983, + "learning_rate": 3.996261931261454e-06, + "loss": 0.76919246, + "num_input_tokens_seen": 8728705, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 3.65039062, + "step": 407, + "time_per_iteration": 2.9497461318969727 + }, + { + "auxiliary_loss_clip": 0.01650917, + "auxiliary_loss_mlp": 0.01509433, + "balance_loss_clip": 1.28820276, + "balance_loss_mlp": 1.14188731, + "epoch": 0.049059099380749115, + "flos": 29897511851520.0, + "grad_norm": 1.9644223781138006, + "language_loss": 0.86701947, + "learning_rate": 3.996214175975987e-06, + "loss": 0.89862299, + "num_input_tokens_seen": 8749225, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 3.671875, + "step": 408, + "time_per_iteration": 2.950841188430786 + }, + { + "auxiliary_loss_clip": 0.01644916, + "auxiliary_loss_mlp": 0.01509161, + "balance_loss_clip": 1.28117704, + "balance_loss_mlp": 1.13532031, + "epoch": 0.049179342271388204, + "flos": 35921815401600.0, + "grad_norm": 2.205666474774196, + "language_loss": 0.79118109, + "learning_rate": 3.996166117866417e-06, + "loss": 0.82272184, + "num_input_tokens_seen": 8771160, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 3.73632812, + "step": 409, + "time_per_iteration": 3.0597586631774902 + }, + { + "auxiliary_loss_clip": 0.01634015, + "auxiliary_loss_mlp": 0.01487231, + "balance_loss_clip": 1.2717036, + "balance_loss_mlp": 1.10805058, + "epoch": 0.049299585162027294, + "flos": 14612993428320.0, + "grad_norm": 2.0628919423795105, + "language_loss": 0.87016201, + "learning_rate": 3.996117756940035e-06, + "loss": 0.90137446, + "num_input_tokens_seen": 8787845, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 3.78710938, + "step": 410, + "time_per_iteration": 3.0143375396728516 + }, + { + "auxiliary_loss_clip": 0.01636444, + "auxiliary_loss_mlp": 0.01528212, + "balance_loss_clip": 1.27287543, + "balance_loss_mlp": 1.1537993, + "epoch": 0.049419828052666384, + "flos": 19569684565440.0, + "grad_norm": 2.321152419275784, + "language_loss": 0.97673428, + "learning_rate": 3.996069093204175e-06, + "loss": 1.00838089, + "num_input_tokens_seen": 8803805, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 3.7421875, + "step": 411, + "time_per_iteration": 2.9467029571533203 + }, + { + "auxiliary_loss_clip": 0.01644031, + "auxiliary_loss_mlp": 0.01530728, + "balance_loss_clip": 1.27980542, + "balance_loss_mlp": 1.15765083, + "epoch": 0.049540070943305474, + "flos": 13661480206080.0, + "grad_norm": 5.177257804549929, + "language_loss": 0.8824361, + "learning_rate": 3.996020126666221e-06, + "loss": 0.91418368, + "num_input_tokens_seen": 8820785, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 3.72851562, + "step": 412, + "time_per_iteration": 2.99625563621521 + }, + { + "auxiliary_loss_clip": 0.01635545, + "auxiliary_loss_mlp": 0.01512537, + "balance_loss_clip": 1.27259469, + "balance_loss_mlp": 1.14022243, + "epoch": 0.04966031383394457, + "flos": 21834018305280.0, + "grad_norm": 2.6530921463737522, + "language_loss": 0.8230741, + "learning_rate": 3.995970857333601e-06, + "loss": 0.85455495, + "num_input_tokens_seen": 8841195, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 3.72070312, + "step": 413, + "time_per_iteration": 2.9614343643188477 + }, + { + "auxiliary_loss_clip": 0.0163539, + "auxiliary_loss_mlp": 0.01513341, + "balance_loss_clip": 1.27127349, + "balance_loss_mlp": 1.13759363, + "epoch": 0.04978055672458366, + "flos": 28621178588160.0, + "grad_norm": 1.8515013974638241, + "language_loss": 0.79752213, + "learning_rate": 3.995921285213789e-06, + "loss": 0.82900941, + "num_input_tokens_seen": 8861455, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 3.75585938, + "step": 414, + "time_per_iteration": 3.0613551139831543 + }, + { + "auxiliary_loss_clip": 0.01644698, + "auxiliary_loss_mlp": 0.01500195, + "balance_loss_clip": 1.28071618, + "balance_loss_mlp": 1.11662745, + "epoch": 0.04990079961522275, + "flos": 19830253574880.0, + "grad_norm": 2.695538389353364, + "language_loss": 0.80632198, + "learning_rate": 3.995871410314305e-06, + "loss": 0.83777094, + "num_input_tokens_seen": 8880015, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 3.83007812, + "step": 415, + "time_per_iteration": 2.932831048965454 + }, + { + "auxiliary_loss_clip": 0.01723032, + "auxiliary_loss_mlp": 0.01426971, + "balance_loss_clip": 1.36833549, + "balance_loss_mlp": 1.0531311, + "epoch": 0.05002104250586184, + "flos": 62741501958240.0, + "grad_norm": 0.910973767074976, + "language_loss": 0.5959568, + "learning_rate": 3.995821232642714e-06, + "loss": 0.62745678, + "num_input_tokens_seen": 8938420, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 3.734375, + "step": 416, + "time_per_iteration": 3.4455835819244385 + }, + { + "auxiliary_loss_clip": 0.01641755, + "auxiliary_loss_mlp": 0.01516988, + "balance_loss_clip": 1.27680123, + "balance_loss_mlp": 1.14314771, + "epoch": 0.05014128539650093, + "flos": 27931030866720.0, + "grad_norm": 2.5190774421011723, + "language_loss": 0.82452208, + "learning_rate": 3.995770752206629e-06, + "loss": 0.85610956, + "num_input_tokens_seen": 8959495, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 3.734375, + "step": 417, + "time_per_iteration": 3.0197813510894775 + }, + { + "auxiliary_loss_clip": 0.0163953, + "auxiliary_loss_mlp": 0.01529074, + "balance_loss_clip": 1.27456629, + "balance_loss_mlp": 1.15218222, + "epoch": 0.05026152828714002, + "flos": 17707127832000.0, + "grad_norm": 2.0881584997895595, + "language_loss": 0.97259605, + "learning_rate": 3.995719969013709e-06, + "loss": 1.00428212, + "num_input_tokens_seen": 8976675, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 3.76953125, + "step": 418, + "time_per_iteration": 2.9671823978424072 + }, + { + "auxiliary_loss_clip": 0.01630345, + "auxiliary_loss_mlp": 0.01507546, + "balance_loss_clip": 1.26537704, + "balance_loss_mlp": 1.1216892, + "epoch": 0.05038177117777912, + "flos": 19135175192640.0, + "grad_norm": 3.086358237731346, + "language_loss": 0.85919255, + "learning_rate": 3.995668883071655e-06, + "loss": 0.89057148, + "num_input_tokens_seen": 8992900, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 3.85742188, + "step": 419, + "time_per_iteration": 2.9661293029785156 + }, + { + "auxiliary_loss_clip": 0.01640965, + "auxiliary_loss_mlp": 0.01540701, + "balance_loss_clip": 1.27554226, + "balance_loss_mlp": 1.15827811, + "epoch": 0.050502014068418206, + "flos": 20669801775840.0, + "grad_norm": 2.264391215654008, + "language_loss": 0.9091593, + "learning_rate": 3.995617494388219e-06, + "loss": 0.94097596, + "num_input_tokens_seen": 9011020, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 3.82226562, + "step": 420, + "time_per_iteration": 3.0491721630096436 + }, + { + "auxiliary_loss_clip": 0.01632779, + "auxiliary_loss_mlp": 0.01500028, + "balance_loss_clip": 1.26822805, + "balance_loss_mlp": 1.11760473, + "epoch": 0.050622256959057296, + "flos": 21363249247200.0, + "grad_norm": 2.1613037859829656, + "language_loss": 0.80667973, + "learning_rate": 3.995565802971196e-06, + "loss": 0.83800781, + "num_input_tokens_seen": 9030995, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 3.82226562, + "step": 421, + "time_per_iteration": 2.9840354919433594 + }, + { + "auxiliary_loss_clip": 0.01636366, + "auxiliary_loss_mlp": 0.01502561, + "balance_loss_clip": 1.27089024, + "balance_loss_mlp": 1.12242675, + "epoch": 0.050742499849696386, + "flos": 27676113153120.0, + "grad_norm": 2.405965028428713, + "language_loss": 0.67268217, + "learning_rate": 3.995513808828427e-06, + "loss": 0.7040714, + "num_input_tokens_seen": 9053790, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 3.79882812, + "step": 422, + "time_per_iteration": 3.219254493713379 + }, + { + "auxiliary_loss_clip": 0.01641226, + "auxiliary_loss_mlp": 0.01509802, + "balance_loss_clip": 1.27560163, + "balance_loss_mlp": 1.13100243, + "epoch": 0.050862742740335476, + "flos": 19868106242880.0, + "grad_norm": 2.3404007788577674, + "language_loss": 0.76561713, + "learning_rate": 3.9954615119678e-06, + "loss": 0.79712737, + "num_input_tokens_seen": 9072345, + "router_z_loss_clip": 3.65820312, + "router_z_loss_mlp": 3.78515625, + "step": 423, + "time_per_iteration": 3.0278825759887695 + }, + { + "auxiliary_loss_clip": 0.01636413, + "auxiliary_loss_mlp": 0.01516636, + "balance_loss_clip": 1.27223969, + "balance_loss_mlp": 1.13726485, + "epoch": 0.050982985630974566, + "flos": 22086963754560.0, + "grad_norm": 2.0847717176923726, + "language_loss": 0.80958164, + "learning_rate": 3.995408912397248e-06, + "loss": 0.84111214, + "num_input_tokens_seen": 9090240, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 3.7890625, + "step": 424, + "time_per_iteration": 3.008049249649048 + }, + { + "auxiliary_loss_clip": 0.0163225, + "auxiliary_loss_mlp": 0.01532521, + "balance_loss_clip": 1.26818013, + "balance_loss_mlp": 1.16707289, + "epoch": 0.05110322852161366, + "flos": 20743079709600.0, + "grad_norm": 2.86944994009493, + "language_loss": 0.93500102, + "learning_rate": 3.99535601012475e-06, + "loss": 0.96664876, + "num_input_tokens_seen": 9105570, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 3.65234375, + "step": 425, + "time_per_iteration": 3.067417621612549 + }, + { + "auxiliary_loss_clip": 0.01639206, + "auxiliary_loss_mlp": 0.01518086, + "balance_loss_clip": 1.27392721, + "balance_loss_mlp": 1.13089418, + "epoch": 0.05122347141225275, + "flos": 28549759134240.0, + "grad_norm": 1.6681854935606117, + "language_loss": 0.75516772, + "learning_rate": 3.995302805158333e-06, + "loss": 0.7867406, + "num_input_tokens_seen": 9128225, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 3.87109375, + "step": 426, + "time_per_iteration": 3.813797950744629 + }, + { + "auxiliary_loss_clip": 0.01631198, + "auxiliary_loss_mlp": 0.01526799, + "balance_loss_clip": 1.26583183, + "balance_loss_mlp": 1.16135085, + "epoch": 0.05134371430289184, + "flos": 19724925981600.0, + "grad_norm": 2.5221092604102613, + "language_loss": 0.8349874, + "learning_rate": 3.9952492975060665e-06, + "loss": 0.86656737, + "num_input_tokens_seen": 9148295, + "router_z_loss_clip": 3.65820312, + "router_z_loss_mlp": 3.65429688, + "step": 427, + "time_per_iteration": 2.9434728622436523 + }, + { + "auxiliary_loss_clip": 0.01632704, + "auxiliary_loss_mlp": 0.01499926, + "balance_loss_clip": 1.26796293, + "balance_loss_mlp": 1.12856531, + "epoch": 0.05146395719353093, + "flos": 34461794237760.0, + "grad_norm": 3.9713912562824705, + "language_loss": 0.85611397, + "learning_rate": 3.995195487176067e-06, + "loss": 0.8874402, + "num_input_tokens_seen": 9168525, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 3.70898438, + "step": 428, + "time_per_iteration": 3.8744075298309326 + }, + { + "auxiliary_loss_clip": 0.01636139, + "auxiliary_loss_mlp": 0.01521171, + "balance_loss_clip": 1.27132344, + "balance_loss_mlp": 1.14942825, + "epoch": 0.05158420008417002, + "flos": 21762333354240.0, + "grad_norm": 1.9680591954003288, + "language_loss": 0.85792094, + "learning_rate": 3.995141374176499e-06, + "loss": 0.88949406, + "num_input_tokens_seen": 9186920, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 3.71289062, + "step": 429, + "time_per_iteration": 3.955000877380371 + }, + { + "auxiliary_loss_clip": 0.01716344, + "auxiliary_loss_mlp": 0.01473915, + "balance_loss_clip": 1.36057425, + "balance_loss_mlp": 1.13211823, + "epoch": 0.05170444297480911, + "flos": 72561200584320.0, + "grad_norm": 0.8877365671281526, + "language_loss": 0.630723, + "learning_rate": 3.995086958515572e-06, + "loss": 0.66262555, + "num_input_tokens_seen": 9244940, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 3.421875, + "step": 430, + "time_per_iteration": 3.510624647140503 + }, + { + "auxiliary_loss_clip": 0.01716804, + "auxiliary_loss_mlp": 0.01463737, + "balance_loss_clip": 1.36195219, + "balance_loss_mlp": 1.11354828, + "epoch": 0.05182468586544821, + "flos": 62423129704320.0, + "grad_norm": 0.8829850757726138, + "language_loss": 0.59882551, + "learning_rate": 3.995032240201538e-06, + "loss": 0.63063097, + "num_input_tokens_seen": 9307335, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 3.5078125, + "step": 431, + "time_per_iteration": 3.4116809368133545 + }, + { + "auxiliary_loss_clip": 0.01714976, + "auxiliary_loss_mlp": 0.01450912, + "balance_loss_clip": 1.36056209, + "balance_loss_mlp": 1.09843445, + "epoch": 0.0519449287560873, + "flos": 41230179717120.0, + "grad_norm": 0.9530071102434652, + "language_loss": 0.63083899, + "learning_rate": 3.9949772192427e-06, + "loss": 0.66249788, + "num_input_tokens_seen": 9353960, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 3.53125, + "step": 432, + "time_per_iteration": 3.1051130294799805 + }, + { + "auxiliary_loss_clip": 0.01629334, + "auxiliary_loss_mlp": 0.01520998, + "balance_loss_clip": 1.26290429, + "balance_loss_mlp": 1.14181721, + "epoch": 0.05206517164672639, + "flos": 17496510573600.0, + "grad_norm": 1.881944525219484, + "language_loss": 0.79639125, + "learning_rate": 3.994921895647405e-06, + "loss": 0.82789463, + "num_input_tokens_seen": 9372130, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 3.78710938, + "step": 433, + "time_per_iteration": 3.010024070739746 + }, + { + "auxiliary_loss_clip": 0.01703139, + "auxiliary_loss_mlp": 0.01428467, + "balance_loss_clip": 1.34828711, + "balance_loss_mlp": 1.0592041, + "epoch": 0.05218541453736548, + "flos": 64008770391360.0, + "grad_norm": 0.8604622451176425, + "language_loss": 0.55320597, + "learning_rate": 3.994866269424043e-06, + "loss": 0.58452201, + "num_input_tokens_seen": 9428500, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 3.6875, + "step": 434, + "time_per_iteration": 3.3360040187835693 + }, + { + "auxiliary_loss_clip": 0.01625364, + "auxiliary_loss_mlp": 0.01493305, + "balance_loss_clip": 1.2593832, + "balance_loss_mlp": 1.11202621, + "epoch": 0.05230565742800457, + "flos": 19319318231040.0, + "grad_norm": 2.258959869536104, + "language_loss": 0.78412151, + "learning_rate": 3.9948103405810545e-06, + "loss": 0.81530821, + "num_input_tokens_seen": 9447450, + "router_z_loss_clip": 3.65820312, + "router_z_loss_mlp": 3.80664062, + "step": 435, + "time_per_iteration": 2.9953665733337402 + }, + { + "auxiliary_loss_clip": 0.01624921, + "auxiliary_loss_mlp": 0.01491709, + "balance_loss_clip": 1.25963879, + "balance_loss_mlp": 1.10604334, + "epoch": 0.05242590031864366, + "flos": 25300838452320.0, + "grad_norm": 2.5932867194589444, + "language_loss": 0.85941505, + "learning_rate": 3.994754109126923e-06, + "loss": 0.89058137, + "num_input_tokens_seen": 9468945, + "router_z_loss_clip": 3.65039062, + "router_z_loss_mlp": 3.8515625, + "step": 436, + "time_per_iteration": 3.0023250579833984 + }, + { + "auxiliary_loss_clip": 0.01626384, + "auxiliary_loss_mlp": 0.01495178, + "balance_loss_clip": 1.25992179, + "balance_loss_mlp": 1.10932159, + "epoch": 0.052546143209282754, + "flos": 26213930084160.0, + "grad_norm": 1.8309534240790943, + "language_loss": 0.93439269, + "learning_rate": 3.994697575070181e-06, + "loss": 0.96560836, + "num_input_tokens_seen": 9488405, + "router_z_loss_clip": 3.6640625, + "router_z_loss_mlp": 3.85351562, + "step": 437, + "time_per_iteration": 2.9736552238464355 + }, + { + "auxiliary_loss_clip": 0.01628761, + "auxiliary_loss_mlp": 0.01496974, + "balance_loss_clip": 1.2641499, + "balance_loss_mlp": 1.11550391, + "epoch": 0.052666386099921844, + "flos": 22160014119360.0, + "grad_norm": 2.889971060001683, + "language_loss": 0.91487616, + "learning_rate": 3.994640738419402e-06, + "loss": 0.94613349, + "num_input_tokens_seen": 9507780, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 3.81054688, + "step": 438, + "time_per_iteration": 3.0002996921539307 + }, + { + "auxiliary_loss_clip": 0.01625682, + "auxiliary_loss_mlp": 0.01497527, + "balance_loss_clip": 1.25965858, + "balance_loss_mlp": 1.11224246, + "epoch": 0.052786628990560934, + "flos": 23884283324160.0, + "grad_norm": 2.271532422981742, + "language_loss": 0.80871975, + "learning_rate": 3.9945835991832075e-06, + "loss": 0.83995181, + "num_input_tokens_seen": 9529665, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 3.84960938, + "step": 439, + "time_per_iteration": 3.0333001613616943 + }, + { + "auxiliary_loss_clip": 0.01635298, + "auxiliary_loss_mlp": 0.01509928, + "balance_loss_clip": 1.27036929, + "balance_loss_mlp": 1.13131952, + "epoch": 0.052906871881200024, + "flos": 24607163412000.0, + "grad_norm": 6.689685592862531, + "language_loss": 0.93435287, + "learning_rate": 3.994526157370268e-06, + "loss": 0.96580505, + "num_input_tokens_seen": 9548280, + "router_z_loss_clip": 3.65039062, + "router_z_loss_mlp": 3.78515625, + "step": 440, + "time_per_iteration": 2.986886501312256 + }, + { + "auxiliary_loss_clip": 0.01659781, + "auxiliary_loss_mlp": 0.01441101, + "balance_loss_clip": 1.30584788, + "balance_loss_mlp": 1.06268311, + "epoch": 0.053027114771839114, + "flos": 56467856134080.0, + "grad_norm": 0.9404886345268185, + "language_loss": 0.59217775, + "learning_rate": 3.994468412989296e-06, + "loss": 0.62318659, + "num_input_tokens_seen": 9609690, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 3.78125, + "step": 441, + "time_per_iteration": 3.529249429702759 + }, + { + "auxiliary_loss_clip": 0.01630234, + "auxiliary_loss_mlp": 0.01494905, + "balance_loss_clip": 1.26712394, + "balance_loss_mlp": 1.10962033, + "epoch": 0.053147357662478203, + "flos": 17313012313920.0, + "grad_norm": 2.3397271008008413, + "language_loss": 0.92703325, + "learning_rate": 3.994410366049052e-06, + "loss": 0.95828456, + "num_input_tokens_seen": 9627550, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 3.8515625, + "step": 442, + "time_per_iteration": 3.0655226707458496 + }, + { + "auxiliary_loss_clip": 0.01621612, + "auxiliary_loss_mlp": 0.01485419, + "balance_loss_clip": 1.25619197, + "balance_loss_mlp": 1.10528409, + "epoch": 0.0532676005531173, + "flos": 17166873656160.0, + "grad_norm": 8.473823475056454, + "language_loss": 0.83337563, + "learning_rate": 3.994352016558341e-06, + "loss": 0.86444598, + "num_input_tokens_seen": 9644855, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 3.79882812, + "step": 443, + "time_per_iteration": 2.9632885456085205 + }, + { + "auxiliary_loss_clip": 0.01626191, + "auxiliary_loss_mlp": 0.01505372, + "balance_loss_clip": 1.26168919, + "balance_loss_mlp": 1.1290524, + "epoch": 0.05338784344375639, + "flos": 27822517308000.0, + "grad_norm": 4.438837660762585, + "language_loss": 0.73994017, + "learning_rate": 3.994293364526014e-06, + "loss": 0.77125573, + "num_input_tokens_seen": 9665740, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 3.75976562, + "step": 444, + "time_per_iteration": 3.0426247119903564 + }, + { + "auxiliary_loss_clip": 0.01628821, + "auxiliary_loss_mlp": 0.01500598, + "balance_loss_clip": 1.26463389, + "balance_loss_mlp": 1.12485003, + "epoch": 0.05350808633439548, + "flos": 21509880971040.0, + "grad_norm": 2.8239051792824723, + "language_loss": 0.85006291, + "learning_rate": 3.99423440996097e-06, + "loss": 0.88135707, + "num_input_tokens_seen": 9685280, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 3.75390625, + "step": 445, + "time_per_iteration": 2.988996982574463 + }, + { + "auxiliary_loss_clip": 0.01635606, + "auxiliary_loss_mlp": 0.01520572, + "balance_loss_clip": 1.27028906, + "balance_loss_mlp": 1.15321708, + "epoch": 0.05362832922503457, + "flos": 20086384989600.0, + "grad_norm": 2.9702937267315574, + "language_loss": 0.81709367, + "learning_rate": 3.994175152872152e-06, + "loss": 0.84865546, + "num_input_tokens_seen": 9704365, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 3.671875, + "step": 446, + "time_per_iteration": 2.888666868209839 + }, + { + "auxiliary_loss_clip": 0.01620591, + "auxiliary_loss_mlp": 0.01492213, + "balance_loss_clip": 1.25605464, + "balance_loss_mlp": 1.11665654, + "epoch": 0.05374857211567366, + "flos": 26139438449280.0, + "grad_norm": 2.69226582833599, + "language_loss": 0.78901976, + "learning_rate": 3.994115593268548e-06, + "loss": 0.82014787, + "num_input_tokens_seen": 9724145, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 3.75, + "step": 447, + "time_per_iteration": 2.9634690284729004 + }, + { + "auxiliary_loss_clip": 0.01616276, + "auxiliary_loss_mlp": 0.0151455, + "balance_loss_clip": 1.25062716, + "balance_loss_mlp": 1.14128184, + "epoch": 0.05386881500631275, + "flos": 27489087574560.0, + "grad_norm": 2.356012270306804, + "language_loss": 0.82594669, + "learning_rate": 3.994055731159195e-06, + "loss": 0.85725492, + "num_input_tokens_seen": 9741615, + "router_z_loss_clip": 3.66015625, + "router_z_loss_mlp": 3.72851562, + "step": 448, + "time_per_iteration": 2.9943883419036865 + }, + { + "auxiliary_loss_clip": 0.01630042, + "auxiliary_loss_mlp": 0.01514549, + "balance_loss_clip": 1.26514137, + "balance_loss_mlp": 1.14509523, + "epoch": 0.053989057896951846, + "flos": 23587264988640.0, + "grad_norm": 2.0437614312106853, + "language_loss": 0.87290597, + "learning_rate": 3.993995566553172e-06, + "loss": 0.90435195, + "num_input_tokens_seen": 9760580, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 3.69140625, + "step": 449, + "time_per_iteration": 2.9963653087615967 + }, + { + "auxiliary_loss_clip": 0.01618696, + "auxiliary_loss_mlp": 0.01517585, + "balance_loss_clip": 1.25444567, + "balance_loss_mlp": 1.14851308, + "epoch": 0.054109300787590936, + "flos": 25231239550080.0, + "grad_norm": 1.757103840769177, + "language_loss": 0.7713114, + "learning_rate": 3.993935099459607e-06, + "loss": 0.80267429, + "num_input_tokens_seen": 9782195, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 3.6875, + "step": 450, + "time_per_iteration": 3.0418410301208496 + }, + { + "auxiliary_loss_clip": 0.0161426, + "auxiliary_loss_mlp": 0.0151819, + "balance_loss_clip": 1.251616, + "balance_loss_mlp": 1.14511251, + "epoch": 0.054229543678230026, + "flos": 23843775684960.0, + "grad_norm": 2.519383249078448, + "language_loss": 0.73922074, + "learning_rate": 3.993874329887673e-06, + "loss": 0.77054513, + "num_input_tokens_seen": 9800850, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 3.7265625, + "step": 451, + "time_per_iteration": 2.925530195236206 + }, + { + "auxiliary_loss_clip": 0.01618531, + "auxiliary_loss_mlp": 0.01507236, + "balance_loss_clip": 1.25370789, + "balance_loss_mlp": 1.13816392, + "epoch": 0.054349786568869116, + "flos": 16322508578880.0, + "grad_norm": 3.1427958451939597, + "language_loss": 0.86331439, + "learning_rate": 3.993813257846589e-06, + "loss": 0.89457214, + "num_input_tokens_seen": 9817605, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 3.6875, + "step": 452, + "time_per_iteration": 3.040703058242798 + }, + { + "auxiliary_loss_clip": 0.01614982, + "auxiliary_loss_mlp": 0.01509983, + "balance_loss_clip": 1.24897599, + "balance_loss_mlp": 1.13862216, + "epoch": 0.054470029459508205, + "flos": 18662699367360.0, + "grad_norm": 2.254845089967026, + "language_loss": 0.92990708, + "learning_rate": 3.993751883345619e-06, + "loss": 0.96115679, + "num_input_tokens_seen": 9835965, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 3.70898438, + "step": 453, + "time_per_iteration": 3.751206159591675 + }, + { + "auxiliary_loss_clip": 0.01627358, + "auxiliary_loss_mlp": 0.01523386, + "balance_loss_clip": 1.262398, + "balance_loss_mlp": 1.15831947, + "epoch": 0.054590272350147295, + "flos": 17787232834560.0, + "grad_norm": 2.574064089820435, + "language_loss": 0.87780178, + "learning_rate": 3.993690206394073e-06, + "loss": 0.90930927, + "num_input_tokens_seen": 9852265, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 3.6484375, + "step": 454, + "time_per_iteration": 2.934614419937134 + }, + { + "auxiliary_loss_clip": 0.01625113, + "auxiliary_loss_mlp": 0.0155543, + "balance_loss_clip": 1.25911045, + "balance_loss_mlp": 1.19150758, + "epoch": 0.054710515240786385, + "flos": 17787915541440.0, + "grad_norm": 2.707507278280881, + "language_loss": 0.88158053, + "learning_rate": 3.993628227001307e-06, + "loss": 0.91338587, + "num_input_tokens_seen": 9870465, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 3.640625, + "step": 455, + "time_per_iteration": 3.0370304584503174 + }, + { + "auxiliary_loss_clip": 0.01616875, + "auxiliary_loss_mlp": 0.01520235, + "balance_loss_clip": 1.25132453, + "balance_loss_mlp": 1.14658523, + "epoch": 0.05483075813142548, + "flos": 48214341048960.0, + "grad_norm": 2.3437252371746626, + "language_loss": 0.71807468, + "learning_rate": 3.993565945176726e-06, + "loss": 0.7494458, + "num_input_tokens_seen": 9891490, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 3.73242188, + "step": 456, + "time_per_iteration": 4.9156787395477295 + }, + { + "auxiliary_loss_clip": 0.0161572, + "auxiliary_loss_mlp": 0.01518726, + "balance_loss_clip": 1.25213373, + "balance_loss_mlp": 1.14278746, + "epoch": 0.05495100102206457, + "flos": 19684266629760.0, + "grad_norm": 2.3395005293675677, + "language_loss": 0.83898556, + "learning_rate": 3.993503360929776e-06, + "loss": 0.87032998, + "num_input_tokens_seen": 9910375, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 3.7578125, + "step": 457, + "time_per_iteration": 3.9619977474212646 + }, + { + "auxiliary_loss_clip": 0.0161451, + "auxiliary_loss_mlp": 0.01494402, + "balance_loss_clip": 1.24981833, + "balance_loss_mlp": 1.11464846, + "epoch": 0.05507124391270366, + "flos": 26362496144160.0, + "grad_norm": 2.7195248196467876, + "language_loss": 0.81545973, + "learning_rate": 3.99344047426995e-06, + "loss": 0.8465488, + "num_input_tokens_seen": 9931635, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 3.79296875, + "step": 458, + "time_per_iteration": 3.0001027584075928 + }, + { + "auxiliary_loss_clip": 0.0161397, + "auxiliary_loss_mlp": 0.01503636, + "balance_loss_clip": 1.24650121, + "balance_loss_mlp": 1.12826979, + "epoch": 0.05519148680334275, + "flos": 22603512466080.0, + "grad_norm": 2.314338438627428, + "language_loss": 0.93593132, + "learning_rate": 3.993377285206789e-06, + "loss": 0.96710742, + "num_input_tokens_seen": 9951420, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 3.74609375, + "step": 459, + "time_per_iteration": 3.0109522342681885 + }, + { + "auxiliary_loss_clip": 0.01614813, + "auxiliary_loss_mlp": 0.01491184, + "balance_loss_clip": 1.25029397, + "balance_loss_mlp": 1.11638999, + "epoch": 0.05531172969398184, + "flos": 40555734618240.0, + "grad_norm": 1.8654306702820878, + "language_loss": 0.86547983, + "learning_rate": 3.99331379374988e-06, + "loss": 0.89653981, + "num_input_tokens_seen": 9975025, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 3.74414062, + "step": 460, + "time_per_iteration": 3.1077795028686523 + }, + { + "auxiliary_loss_clip": 0.01611685, + "auxiliary_loss_mlp": 0.01516188, + "balance_loss_clip": 1.24676669, + "balance_loss_mlp": 1.14330101, + "epoch": 0.05543197258462093, + "flos": 23480192700000.0, + "grad_norm": 4.426519870147872, + "language_loss": 0.80394399, + "learning_rate": 3.993249999908852e-06, + "loss": 0.83522266, + "num_input_tokens_seen": 9995175, + "router_z_loss_clip": 3.65039062, + "router_z_loss_mlp": 3.7265625, + "step": 461, + "time_per_iteration": 2.971442937850952 + }, + { + "auxiliary_loss_clip": 0.01604066, + "auxiliary_loss_mlp": 0.01519198, + "balance_loss_clip": 1.23716259, + "balance_loss_mlp": 1.14402199, + "epoch": 0.05555221547526003, + "flos": 18626591394720.0, + "grad_norm": 1.9524293264977373, + "language_loss": 0.87324309, + "learning_rate": 3.993185903693384e-06, + "loss": 0.90447569, + "num_input_tokens_seen": 10011975, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 3.75195312, + "step": 462, + "time_per_iteration": 2.967041015625 + }, + { + "auxiliary_loss_clip": 0.01610782, + "auxiliary_loss_mlp": 0.01519364, + "balance_loss_clip": 1.24511886, + "balance_loss_mlp": 1.14018309, + "epoch": 0.05567245836589912, + "flos": 23589388965600.0, + "grad_norm": 3.373481796651164, + "language_loss": 0.82579041, + "learning_rate": 3.9931215051131995e-06, + "loss": 0.8570919, + "num_input_tokens_seen": 10032620, + "router_z_loss_clip": 3.66210938, + "router_z_loss_mlp": 3.7890625, + "step": 463, + "time_per_iteration": 3.2095141410827637 + }, + { + "auxiliary_loss_clip": 0.01618314, + "auxiliary_loss_mlp": 0.01519616, + "balance_loss_clip": 1.25177836, + "balance_loss_mlp": 1.13623881, + "epoch": 0.05579270125653821, + "flos": 27749201446080.0, + "grad_norm": 1.9193760508611533, + "language_loss": 0.8026858, + "learning_rate": 3.993056804178068e-06, + "loss": 0.83406508, + "num_input_tokens_seen": 10054165, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 3.83007812, + "step": 464, + "time_per_iteration": 3.054391622543335 + }, + { + "auxiliary_loss_clip": 0.01610537, + "auxiliary_loss_mlp": 0.01514773, + "balance_loss_clip": 1.24417019, + "balance_loss_mlp": 1.13673651, + "epoch": 0.0559129441471773, + "flos": 27016497964800.0, + "grad_norm": 2.903839664890213, + "language_loss": 0.84496439, + "learning_rate": 3.992991800897803e-06, + "loss": 0.87621754, + "num_input_tokens_seen": 10073970, + "router_z_loss_clip": 3.66015625, + "router_z_loss_mlp": 3.77929688, + "step": 465, + "time_per_iteration": 2.9597110748291016 + }, + { + "auxiliary_loss_clip": 0.01605059, + "auxiliary_loss_mlp": 0.01501438, + "balance_loss_clip": 1.23900366, + "balance_loss_mlp": 1.11596334, + "epoch": 0.05603318703781639, + "flos": 15231494126880.0, + "grad_norm": 2.482045328986496, + "language_loss": 0.90259278, + "learning_rate": 3.9929264952822665e-06, + "loss": 0.93365777, + "num_input_tokens_seen": 10091505, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 3.8515625, + "step": 466, + "time_per_iteration": 2.939115524291992 + }, + { + "auxiliary_loss_clip": 0.01612202, + "auxiliary_loss_mlp": 0.01522166, + "balance_loss_clip": 1.24542546, + "balance_loss_mlp": 1.14584565, + "epoch": 0.05615342992845548, + "flos": 22268300109120.0, + "grad_norm": 1.9189410253662431, + "language_loss": 0.88198984, + "learning_rate": 3.992860887341366e-06, + "loss": 0.91333354, + "num_input_tokens_seen": 10109675, + "router_z_loss_clip": 3.67382812, + "router_z_loss_mlp": 3.75976562, + "step": 467, + "time_per_iteration": 2.9343554973602295 + }, + { + "auxiliary_loss_clip": 0.01619021, + "auxiliary_loss_mlp": 0.01498266, + "balance_loss_clip": 1.2544663, + "balance_loss_mlp": 1.11794066, + "epoch": 0.056273672819094574, + "flos": 23587151204160.0, + "grad_norm": 2.2810061497913674, + "language_loss": 0.81212151, + "learning_rate": 3.992794977085052e-06, + "loss": 0.84329438, + "num_input_tokens_seen": 10127675, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 3.79882812, + "step": 468, + "time_per_iteration": 3.10172438621521 + }, + { + "auxiliary_loss_clip": 0.01611618, + "auxiliary_loss_mlp": 0.01501699, + "balance_loss_clip": 1.24396276, + "balance_loss_mlp": 1.11546135, + "epoch": 0.056393915709733664, + "flos": 19860406826400.0, + "grad_norm": 3.272281912824114, + "language_loss": 0.84899724, + "learning_rate": 3.992728764523326e-06, + "loss": 0.88013041, + "num_input_tokens_seen": 10146620, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 3.859375, + "step": 469, + "time_per_iteration": 2.9966683387756348 + }, + { + "auxiliary_loss_clip": 0.01612787, + "auxiliary_loss_mlp": 0.01492137, + "balance_loss_clip": 1.24688721, + "balance_loss_mlp": 1.10952246, + "epoch": 0.05651415860037275, + "flos": 22165968840480.0, + "grad_norm": 2.4336233392701723, + "language_loss": 0.81033313, + "learning_rate": 3.99266224966623e-06, + "loss": 0.84138238, + "num_input_tokens_seen": 10167535, + "router_z_loss_clip": 3.65820312, + "router_z_loss_mlp": 3.82617188, + "step": 470, + "time_per_iteration": 2.9593279361724854 + }, + { + "auxiliary_loss_clip": 0.01618502, + "auxiliary_loss_mlp": 0.0151029, + "balance_loss_clip": 1.25286627, + "balance_loss_mlp": 1.14083672, + "epoch": 0.05663440149101184, + "flos": 19465570673280.0, + "grad_norm": 2.1170004348916467, + "language_loss": 0.87766719, + "learning_rate": 3.992595432523855e-06, + "loss": 0.90895504, + "num_input_tokens_seen": 10184825, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 3.69335938, + "step": 471, + "time_per_iteration": 2.888307571411133 + }, + { + "auxiliary_loss_clip": 0.01612365, + "auxiliary_loss_mlp": 0.01500227, + "balance_loss_clip": 1.24447966, + "balance_loss_mlp": 1.11627817, + "epoch": 0.05675464438165093, + "flos": 22672201092480.0, + "grad_norm": 5.288723329348186, + "language_loss": 0.86229509, + "learning_rate": 3.992528313106338e-06, + "loss": 0.89342105, + "num_input_tokens_seen": 10203025, + "router_z_loss_clip": 3.6796875, + "router_z_loss_mlp": 3.83789062, + "step": 472, + "time_per_iteration": 3.1488897800445557 + }, + { + "auxiliary_loss_clip": 0.01620705, + "auxiliary_loss_mlp": 0.01509861, + "balance_loss_clip": 1.25360775, + "balance_loss_mlp": 1.13277864, + "epoch": 0.05687488727229002, + "flos": 16902587687040.0, + "grad_norm": 11.040811098964369, + "language_loss": 0.82311141, + "learning_rate": 3.9924608914238595e-06, + "loss": 0.85441709, + "num_input_tokens_seen": 10218020, + "router_z_loss_clip": 3.66992188, + "router_z_loss_mlp": 3.76757812, + "step": 473, + "time_per_iteration": 3.0276083946228027 + }, + { + "auxiliary_loss_clip": 0.0161685, + "auxiliary_loss_mlp": 0.0150265, + "balance_loss_clip": 1.24910963, + "balance_loss_mlp": 1.12251568, + "epoch": 0.05699513016292912, + "flos": 29171559582720.0, + "grad_norm": 4.671251956194121, + "language_loss": 0.83939797, + "learning_rate": 3.992393167486648e-06, + "loss": 0.87059295, + "num_input_tokens_seen": 10237170, + "router_z_loss_clip": 3.6796875, + "router_z_loss_mlp": 3.79492188, + "step": 474, + "time_per_iteration": 3.081576347351074 + }, + { + "auxiliary_loss_clip": 0.01612158, + "auxiliary_loss_mlp": 0.01507868, + "balance_loss_clip": 1.24412811, + "balance_loss_mlp": 1.13498139, + "epoch": 0.05711537305356821, + "flos": 18918186003360.0, + "grad_norm": 2.3263828955236083, + "language_loss": 0.80930054, + "learning_rate": 3.992325141304977e-06, + "loss": 0.84050071, + "num_input_tokens_seen": 10255125, + "router_z_loss_clip": 3.6796875, + "router_z_loss_mlp": 3.7265625, + "step": 475, + "time_per_iteration": 3.1044931411743164 + }, + { + "auxiliary_loss_clip": 0.0161802, + "auxiliary_loss_mlp": 0.01492827, + "balance_loss_clip": 1.2517879, + "balance_loss_mlp": 1.1199404, + "epoch": 0.0572356159442073, + "flos": 26761314754080.0, + "grad_norm": 2.9462619474976215, + "language_loss": 0.86924201, + "learning_rate": 3.992256812889166e-06, + "loss": 0.90035045, + "num_input_tokens_seen": 10271230, + "router_z_loss_clip": 3.66015625, + "router_z_loss_mlp": 3.72851562, + "step": 476, + "time_per_iteration": 3.0183441638946533 + }, + { + "auxiliary_loss_clip": 0.01622867, + "auxiliary_loss_mlp": 0.01519259, + "balance_loss_clip": 1.25593531, + "balance_loss_mlp": 1.14389288, + "epoch": 0.05735585883484639, + "flos": 35119057880160.0, + "grad_norm": 3.6347588547009124, + "language_loss": 0.77101076, + "learning_rate": 3.992188182249582e-06, + "loss": 0.80243194, + "num_input_tokens_seen": 10293125, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 3.75195312, + "step": 477, + "time_per_iteration": 3.1734633445739746 + }, + { + "auxiliary_loss_clip": 0.01619882, + "auxiliary_loss_mlp": 0.01526688, + "balance_loss_clip": 1.25237465, + "balance_loss_mlp": 1.15876031, + "epoch": 0.05747610172548548, + "flos": 18736394510880.0, + "grad_norm": 2.132579068567865, + "language_loss": 0.9070065, + "learning_rate": 3.992119249396633e-06, + "loss": 0.93847215, + "num_input_tokens_seen": 10311810, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 3.67773438, + "step": 478, + "time_per_iteration": 3.0233378410339355 + }, + { + "auxiliary_loss_clip": 0.01616834, + "auxiliary_loss_mlp": 0.01524063, + "balance_loss_clip": 1.24909782, + "balance_loss_mlp": 1.15575457, + "epoch": 0.05759634461612457, + "flos": 27967252623840.0, + "grad_norm": 2.2253954589762444, + "language_loss": 0.82115221, + "learning_rate": 3.992050014340778e-06, + "loss": 0.85256124, + "num_input_tokens_seen": 10332165, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 3.68359375, + "step": 479, + "time_per_iteration": 3.078108072280884 + }, + { + "auxiliary_loss_clip": 0.01624221, + "auxiliary_loss_mlp": 0.01437332, + "balance_loss_clip": 1.2701745, + "balance_loss_mlp": 1.07493591, + "epoch": 0.057716587506763666, + "flos": 69298246483200.0, + "grad_norm": 0.8448019984375518, + "language_loss": 0.55016589, + "learning_rate": 3.99198047709252e-06, + "loss": 0.58078134, + "num_input_tokens_seen": 10393685, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 3.6171875, + "step": 480, + "time_per_iteration": 4.397336959838867 + }, + { + "auxiliary_loss_clip": 0.01611494, + "auxiliary_loss_mlp": 0.01507905, + "balance_loss_clip": 1.24319601, + "balance_loss_mlp": 1.13578176, + "epoch": 0.057836830397402755, + "flos": 25011519533280.0, + "grad_norm": 8.49413414940105, + "language_loss": 0.78745604, + "learning_rate": 3.991910637662408e-06, + "loss": 0.81865007, + "num_input_tokens_seen": 10413975, + "router_z_loss_clip": 3.68359375, + "router_z_loss_mlp": 3.71679688, + "step": 481, + "time_per_iteration": 3.034850597381592 + }, + { + "auxiliary_loss_clip": 0.01615383, + "auxiliary_loss_mlp": 0.01515568, + "balance_loss_clip": 1.24706006, + "balance_loss_mlp": 1.14229965, + "epoch": 0.057957073288041845, + "flos": 25596301733280.0, + "grad_norm": 4.722611426516906, + "language_loss": 0.80993336, + "learning_rate": 3.9918404960610355e-06, + "loss": 0.84124279, + "num_input_tokens_seen": 10433005, + "router_z_loss_clip": 3.68554688, + "router_z_loss_mlp": 3.72851562, + "step": 482, + "time_per_iteration": 2.9926369190216064 + }, + { + "auxiliary_loss_clip": 0.0161872, + "auxiliary_loss_mlp": 0.01507969, + "balance_loss_clip": 1.25084925, + "balance_loss_mlp": 1.13145852, + "epoch": 0.058077316178680935, + "flos": 20779529035680.0, + "grad_norm": 2.6975953284126497, + "language_loss": 0.77623415, + "learning_rate": 3.991770052299043e-06, + "loss": 0.80750108, + "num_input_tokens_seen": 10451235, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 3.76171875, + "step": 483, + "time_per_iteration": 3.9706358909606934 + }, + { + "auxiliary_loss_clip": 0.01614257, + "auxiliary_loss_mlp": 0.01498132, + "balance_loss_clip": 1.24549556, + "balance_loss_mlp": 1.1166625, + "epoch": 0.058197559069320025, + "flos": 18918489428640.0, + "grad_norm": 2.6401770828055637, + "language_loss": 0.87925148, + "learning_rate": 3.991699306387118e-06, + "loss": 0.91037536, + "num_input_tokens_seen": 10469705, + "router_z_loss_clip": 3.68554688, + "router_z_loss_mlp": 3.8125, + "step": 484, + "time_per_iteration": 4.457256317138672 + }, + { + "auxiliary_loss_clip": 0.01616373, + "auxiliary_loss_mlp": 0.01547621, + "balance_loss_clip": 1.24794388, + "balance_loss_mlp": 1.17492485, + "epoch": 0.058317801959959115, + "flos": 24865646372640.0, + "grad_norm": 2.09794164855839, + "language_loss": 0.78147221, + "learning_rate": 3.991628258335991e-06, + "loss": 0.81311214, + "num_input_tokens_seen": 10491910, + "router_z_loss_clip": 3.68554688, + "router_z_loss_mlp": 3.72460938, + "step": 485, + "time_per_iteration": 2.901153087615967 + }, + { + "auxiliary_loss_clip": 0.01615776, + "auxiliary_loss_mlp": 0.01508084, + "balance_loss_clip": 1.2482301, + "balance_loss_mlp": 1.13844037, + "epoch": 0.05843804485059821, + "flos": 23260207186080.0, + "grad_norm": 4.190177703837018, + "language_loss": 0.87983871, + "learning_rate": 3.991556908156442e-06, + "loss": 0.91107726, + "num_input_tokens_seen": 10508435, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 3.6953125, + "step": 486, + "time_per_iteration": 3.075747489929199 + }, + { + "auxiliary_loss_clip": 0.01618415, + "auxiliary_loss_mlp": 0.01534317, + "balance_loss_clip": 1.2494638, + "balance_loss_mlp": 1.16486335, + "epoch": 0.0585582877412373, + "flos": 23152869400320.0, + "grad_norm": 3.6130261777491803, + "language_loss": 0.87519729, + "learning_rate": 3.9914852558592914e-06, + "loss": 0.90672457, + "num_input_tokens_seen": 10529485, + "router_z_loss_clip": 3.69140625, + "router_z_loss_mlp": 3.69335938, + "step": 487, + "time_per_iteration": 3.072246789932251 + }, + { + "auxiliary_loss_clip": 0.01620459, + "auxiliary_loss_mlp": 0.01534893, + "balance_loss_clip": 1.24947977, + "balance_loss_mlp": 1.170017, + "epoch": 0.05867853063187639, + "flos": 23508259902720.0, + "grad_norm": 3.38337667812192, + "language_loss": 0.80782831, + "learning_rate": 3.991413301455413e-06, + "loss": 0.83938181, + "num_input_tokens_seen": 10545935, + "router_z_loss_clip": 3.7109375, + "router_z_loss_mlp": 3.6484375, + "step": 488, + "time_per_iteration": 2.9923338890075684 + }, + { + "auxiliary_loss_clip": 0.01610478, + "auxiliary_loss_mlp": 0.01505452, + "balance_loss_clip": 1.24159348, + "balance_loss_mlp": 1.13638043, + "epoch": 0.05879877352251548, + "flos": 29498313960000.0, + "grad_norm": 2.5935542769132027, + "language_loss": 0.77992702, + "learning_rate": 3.991341044955719e-06, + "loss": 0.8110863, + "num_input_tokens_seen": 10565690, + "router_z_loss_clip": 3.6875, + "router_z_loss_mlp": 3.6875, + "step": 489, + "time_per_iteration": 3.044487476348877 + }, + { + "auxiliary_loss_clip": 0.01608984, + "auxiliary_loss_mlp": 0.01492831, + "balance_loss_clip": 1.23830128, + "balance_loss_mlp": 1.1195631, + "epoch": 0.05891901641315457, + "flos": 20159587067040.0, + "grad_norm": 2.851217273159702, + "language_loss": 0.81885928, + "learning_rate": 3.991268486371172e-06, + "loss": 0.84987742, + "num_input_tokens_seen": 10584245, + "router_z_loss_clip": 3.70898438, + "router_z_loss_mlp": 3.72851562, + "step": 490, + "time_per_iteration": 3.04490327835083 + }, + { + "auxiliary_loss_clip": 0.01615766, + "auxiliary_loss_mlp": 0.0150825, + "balance_loss_clip": 1.24700797, + "balance_loss_mlp": 1.14165735, + "epoch": 0.05903925930379366, + "flos": 24646495278240.0, + "grad_norm": 3.2973875032330175, + "language_loss": 0.87755752, + "learning_rate": 3.991195625712779e-06, + "loss": 0.90879768, + "num_input_tokens_seen": 10601210, + "router_z_loss_clip": 3.68945312, + "router_z_loss_mlp": 3.6640625, + "step": 491, + "time_per_iteration": 3.044867753982544 + }, + { + "auxiliary_loss_clip": 0.0162272, + "auxiliary_loss_mlp": 0.01509051, + "balance_loss_clip": 1.25299513, + "balance_loss_mlp": 1.13788068, + "epoch": 0.05915950219443276, + "flos": 21252459998880.0, + "grad_norm": 2.3032955026905673, + "language_loss": 0.8132351, + "learning_rate": 3.991122462991592e-06, + "loss": 0.84455287, + "num_input_tokens_seen": 10620730, + "router_z_loss_clip": 3.6953125, + "router_z_loss_mlp": 3.70898438, + "step": 492, + "time_per_iteration": 2.9892354011535645 + }, + { + "auxiliary_loss_clip": 0.01611879, + "auxiliary_loss_mlp": 0.01502006, + "balance_loss_clip": 1.24071562, + "balance_loss_mlp": 1.12740278, + "epoch": 0.05927974508507185, + "flos": 9904127438880.0, + "grad_norm": 5.4128602601939955, + "language_loss": 0.81687737, + "learning_rate": 3.991048998218712e-06, + "loss": 0.8480162, + "num_input_tokens_seen": 10634035, + "router_z_loss_clip": 3.7109375, + "router_z_loss_mlp": 3.74414062, + "step": 493, + "time_per_iteration": 2.9369518756866455 + }, + { + "auxiliary_loss_clip": 0.01619417, + "auxiliary_loss_mlp": 0.01511425, + "balance_loss_clip": 1.25007629, + "balance_loss_mlp": 1.13663077, + "epoch": 0.05939998797571094, + "flos": 18261453355200.0, + "grad_norm": 2.787308038642145, + "language_loss": 0.76514697, + "learning_rate": 3.990975231405281e-06, + "loss": 0.79645538, + "num_input_tokens_seen": 10652485, + "router_z_loss_clip": 3.69726562, + "router_z_loss_mlp": 3.74414062, + "step": 494, + "time_per_iteration": 3.0468766689300537 + }, + { + "auxiliary_loss_clip": 0.01613615, + "auxiliary_loss_mlp": 0.01501203, + "balance_loss_clip": 1.24473369, + "balance_loss_mlp": 1.12660003, + "epoch": 0.05952023086635003, + "flos": 28259112729600.0, + "grad_norm": 2.2881740768173975, + "language_loss": 0.78867704, + "learning_rate": 3.990901162562491e-06, + "loss": 0.81982523, + "num_input_tokens_seen": 10673175, + "router_z_loss_clip": 3.6875, + "router_z_loss_mlp": 3.74023438, + "step": 495, + "time_per_iteration": 3.0217862129211426 + }, + { + "auxiliary_loss_clip": 0.01606945, + "auxiliary_loss_mlp": 0.01491564, + "balance_loss_clip": 1.23731852, + "balance_loss_mlp": 1.11753321, + "epoch": 0.05964047375698912, + "flos": 14904246683520.0, + "grad_norm": 2.6014156228469223, + "language_loss": 0.90998203, + "learning_rate": 3.9908267917015765e-06, + "loss": 0.94096708, + "num_input_tokens_seen": 10691235, + "router_z_loss_clip": 3.69726562, + "router_z_loss_mlp": 3.73632812, + "step": 496, + "time_per_iteration": 3.0351743698120117 + }, + { + "auxiliary_loss_clip": 0.01610292, + "auxiliary_loss_mlp": 0.01505243, + "balance_loss_clip": 1.23949277, + "balance_loss_mlp": 1.12777913, + "epoch": 0.059760716647628206, + "flos": 23187839528160.0, + "grad_norm": 2.5076083944778023, + "language_loss": 0.9292087, + "learning_rate": 3.990752118833821e-06, + "loss": 0.96036404, + "num_input_tokens_seen": 10708675, + "router_z_loss_clip": 3.70898438, + "router_z_loss_mlp": 3.77148438, + "step": 497, + "time_per_iteration": 3.069174289703369 + }, + { + "auxiliary_loss_clip": 0.01617191, + "auxiliary_loss_mlp": 0.0150812, + "balance_loss_clip": 1.24666059, + "balance_loss_mlp": 1.13923883, + "epoch": 0.0598809595382673, + "flos": 22749158057760.0, + "grad_norm": 1.9242953556210265, + "language_loss": 0.77905488, + "learning_rate": 3.990677143970553e-06, + "loss": 0.81030804, + "num_input_tokens_seen": 10729485, + "router_z_loss_clip": 3.70507812, + "router_z_loss_mlp": 3.6875, + "step": 498, + "time_per_iteration": 3.032106637954712 + }, + { + "auxiliary_loss_clip": 0.01614622, + "auxiliary_loss_mlp": 0.01506396, + "balance_loss_clip": 1.24519277, + "balance_loss_mlp": 1.1266427, + "epoch": 0.06000120242890639, + "flos": 22129405729920.0, + "grad_norm": 3.03489970541097, + "language_loss": 0.81029773, + "learning_rate": 3.990601867123144e-06, + "loss": 0.84150791, + "num_input_tokens_seen": 10749210, + "router_z_loss_clip": 3.69140625, + "router_z_loss_mlp": 3.79492188, + "step": 499, + "time_per_iteration": 3.074662923812866 + }, + { + "auxiliary_loss_clip": 0.01617581, + "auxiliary_loss_mlp": 0.01499986, + "balance_loss_clip": 1.24761212, + "balance_loss_mlp": 1.11908889, + "epoch": 0.06012144531954548, + "flos": 19173786423840.0, + "grad_norm": 2.490731970134022, + "language_loss": 0.84855103, + "learning_rate": 3.990526288303014e-06, + "loss": 0.87972677, + "num_input_tokens_seen": 10768000, + "router_z_loss_clip": 3.69921875, + "router_z_loss_mlp": 3.80664062, + "step": 500, + "time_per_iteration": 3.0313076972961426 + }, + { + "auxiliary_loss_clip": 0.01618668, + "auxiliary_loss_mlp": 0.01512035, + "balance_loss_clip": 1.24935627, + "balance_loss_mlp": 1.13266349, + "epoch": 0.06024168821018457, + "flos": 22785531527520.0, + "grad_norm": 1.91620910928942, + "language_loss": 0.91001177, + "learning_rate": 3.9904504075216295e-06, + "loss": 0.94131887, + "num_input_tokens_seen": 10788760, + "router_z_loss_clip": 3.6953125, + "router_z_loss_mlp": 3.79101562, + "step": 501, + "time_per_iteration": 3.0920534133911133 + }, + { + "auxiliary_loss_clip": 0.01615596, + "auxiliary_loss_mlp": 0.01482441, + "balance_loss_clip": 1.24539053, + "balance_loss_mlp": 1.10955417, + "epoch": 0.06036193110082366, + "flos": 18772692124320.0, + "grad_norm": 2.340018547813443, + "language_loss": 0.93755233, + "learning_rate": 3.990374224790501e-06, + "loss": 0.96853274, + "num_input_tokens_seen": 10806965, + "router_z_loss_clip": 3.70117188, + "router_z_loss_mlp": 3.7265625, + "step": 502, + "time_per_iteration": 3.0025649070739746 + }, + { + "auxiliary_loss_clip": 0.01613795, + "auxiliary_loss_mlp": 0.01509509, + "balance_loss_clip": 1.24354362, + "balance_loss_mlp": 1.13242602, + "epoch": 0.06048217399146275, + "flos": 17203474694880.0, + "grad_norm": 2.5028290580749113, + "language_loss": 0.7129159, + "learning_rate": 3.990297740121185e-06, + "loss": 0.74414897, + "num_input_tokens_seen": 10824900, + "router_z_loss_clip": 3.70117188, + "router_z_loss_mlp": 3.76757812, + "step": 503, + "time_per_iteration": 2.9522202014923096 + }, + { + "auxiliary_loss_clip": 0.01611005, + "auxiliary_loss_mlp": 0.01510009, + "balance_loss_clip": 1.24134648, + "balance_loss_mlp": 1.13044667, + "epoch": 0.06060241688210185, + "flos": 24026401596960.0, + "grad_norm": 1.8051046338458503, + "language_loss": 0.78528082, + "learning_rate": 3.990220953525284e-06, + "loss": 0.81649095, + "num_input_tokens_seen": 10842010, + "router_z_loss_clip": 3.6953125, + "router_z_loss_mlp": 3.79101562, + "step": 504, + "time_per_iteration": 2.999481678009033 + }, + { + "auxiliary_loss_clip": 0.01604066, + "auxiliary_loss_mlp": 0.01487857, + "balance_loss_clip": 1.23458815, + "balance_loss_mlp": 1.10638762, + "epoch": 0.06072265977274094, + "flos": 14612955500160.0, + "grad_norm": 2.511561822321938, + "language_loss": 0.73586965, + "learning_rate": 3.9901438650144465e-06, + "loss": 0.76678896, + "num_input_tokens_seen": 10858260, + "router_z_loss_clip": 3.69921875, + "router_z_loss_mlp": 3.8125, + "step": 505, + "time_per_iteration": 3.04904842376709 + }, + { + "auxiliary_loss_clip": 0.01616435, + "auxiliary_loss_mlp": 0.01481465, + "balance_loss_clip": 1.24689007, + "balance_loss_mlp": 1.10228443, + "epoch": 0.06084290266338003, + "flos": 20560377941280.0, + "grad_norm": 5.419208177430198, + "language_loss": 0.92076421, + "learning_rate": 3.990066474600367e-06, + "loss": 0.95174325, + "num_input_tokens_seen": 10876230, + "router_z_loss_clip": 3.6953125, + "router_z_loss_mlp": 3.78710938, + "step": 506, + "time_per_iteration": 2.919661045074463 + }, + { + "auxiliary_loss_clip": 0.01606914, + "auxiliary_loss_mlp": 0.01489408, + "balance_loss_clip": 1.23671889, + "balance_loss_mlp": 1.11175275, + "epoch": 0.06096314555401912, + "flos": 22311386863200.0, + "grad_norm": 2.1616961196681888, + "language_loss": 0.68134832, + "learning_rate": 3.989988782294786e-06, + "loss": 0.71231157, + "num_input_tokens_seen": 10896320, + "router_z_loss_clip": 3.70117188, + "router_z_loss_mlp": 3.77734375, + "step": 507, + "time_per_iteration": 3.0070407390594482 + }, + { + "auxiliary_loss_clip": 0.01614617, + "auxiliary_loss_mlp": 0.01492929, + "balance_loss_clip": 1.24491966, + "balance_loss_mlp": 1.11241269, + "epoch": 0.06108338844465821, + "flos": 19133165000160.0, + "grad_norm": 2.29177138028897, + "language_loss": 0.95136327, + "learning_rate": 3.989910788109489e-06, + "loss": 0.98243868, + "num_input_tokens_seen": 10912970, + "router_z_loss_clip": 3.69335938, + "router_z_loss_mlp": 3.80273438, + "step": 508, + "time_per_iteration": 3.818574905395508 + }, + { + "auxiliary_loss_clip": 0.01613554, + "auxiliary_loss_mlp": 0.01509152, + "balance_loss_clip": 1.24241531, + "balance_loss_mlp": 1.12958932, + "epoch": 0.0612036313352973, + "flos": 33586858699200.0, + "grad_norm": 2.5025480419339794, + "language_loss": 0.7514081, + "learning_rate": 3.989832492056307e-06, + "loss": 0.78263509, + "num_input_tokens_seen": 10933995, + "router_z_loss_clip": 3.71289062, + "router_z_loss_mlp": 3.79296875, + "step": 509, + "time_per_iteration": 3.131214141845703 + }, + { + "auxiliary_loss_clip": 0.01612172, + "auxiliary_loss_mlp": 0.01513845, + "balance_loss_clip": 1.24159551, + "balance_loss_mlp": 1.1314218, + "epoch": 0.06132387422593639, + "flos": 27493032103200.0, + "grad_norm": 4.945477432361776, + "language_loss": 0.80773532, + "learning_rate": 3.989753894147119e-06, + "loss": 0.83899552, + "num_input_tokens_seen": 10954120, + "router_z_loss_clip": 3.70507812, + "router_z_loss_mlp": 3.8203125, + "step": 510, + "time_per_iteration": 3.960765838623047 + }, + { + "auxiliary_loss_clip": 0.01624455, + "auxiliary_loss_mlp": 0.01501588, + "balance_loss_clip": 1.25424957, + "balance_loss_mlp": 1.12240696, + "epoch": 0.061444117116575485, + "flos": 25887441204000.0, + "grad_norm": 2.2349532344259555, + "language_loss": 0.80093402, + "learning_rate": 3.989674994393846e-06, + "loss": 0.83219445, + "num_input_tokens_seen": 10973595, + "router_z_loss_clip": 3.70117188, + "router_z_loss_mlp": 3.7890625, + "step": 511, + "time_per_iteration": 3.979560613632202 + }, + { + "auxiliary_loss_clip": 0.01617419, + "auxiliary_loss_mlp": 0.01496674, + "balance_loss_clip": 1.24855936, + "balance_loss_mlp": 1.11882842, + "epoch": 0.061564360007214575, + "flos": 28514561437440.0, + "grad_norm": 2.0087125629132725, + "language_loss": 0.94180375, + "learning_rate": 3.98959579280846e-06, + "loss": 0.97294462, + "num_input_tokens_seen": 10991995, + "router_z_loss_clip": 3.69140625, + "router_z_loss_mlp": 3.77539062, + "step": 512, + "time_per_iteration": 3.0497734546661377 + }, + { + "auxiliary_loss_clip": 0.01617522, + "auxiliary_loss_mlp": 0.01501195, + "balance_loss_clip": 1.24861634, + "balance_loss_mlp": 1.11972499, + "epoch": 0.061684602897853665, + "flos": 12096017664480.0, + "grad_norm": 2.240633626179099, + "language_loss": 0.83259827, + "learning_rate": 3.989516289402973e-06, + "loss": 0.86378545, + "num_input_tokens_seen": 11007625, + "router_z_loss_clip": 3.68554688, + "router_z_loss_mlp": 3.8125, + "step": 513, + "time_per_iteration": 3.964385986328125 + }, + { + "auxiliary_loss_clip": 0.01623044, + "auxiliary_loss_mlp": 0.01489795, + "balance_loss_clip": 1.25352001, + "balance_loss_mlp": 1.12072349, + "epoch": 0.061804845788492754, + "flos": 19534448940480.0, + "grad_norm": 2.777841994201734, + "language_loss": 0.8048017, + "learning_rate": 3.989436484189447e-06, + "loss": 0.83593011, + "num_input_tokens_seen": 11025570, + "router_z_loss_clip": 3.6953125, + "router_z_loss_mlp": 3.6875, + "step": 514, + "time_per_iteration": 2.9885783195495605 + }, + { + "auxiliary_loss_clip": 0.01608326, + "auxiliary_loss_mlp": 0.01507139, + "balance_loss_clip": 1.23761177, + "balance_loss_mlp": 1.1266228, + "epoch": 0.061925088679131844, + "flos": 15342928153920.0, + "grad_norm": 4.340933596013972, + "language_loss": 0.80696499, + "learning_rate": 3.9893563771799885e-06, + "loss": 0.83811963, + "num_input_tokens_seen": 11042045, + "router_z_loss_clip": 3.70898438, + "router_z_loss_mlp": 3.80273438, + "step": 515, + "time_per_iteration": 2.893307685852051 + }, + { + "auxiliary_loss_clip": 0.01613112, + "auxiliary_loss_mlp": 0.01507152, + "balance_loss_clip": 1.24220693, + "balance_loss_mlp": 1.12911558, + "epoch": 0.062045331569770934, + "flos": 25922221691040.0, + "grad_norm": 2.0485779608066856, + "language_loss": 0.86688221, + "learning_rate": 3.989275968386749e-06, + "loss": 0.89808482, + "num_input_tokens_seen": 11059955, + "router_z_loss_clip": 3.70898438, + "router_z_loss_mlp": 3.77734375, + "step": 516, + "time_per_iteration": 3.003796100616455 + }, + { + "auxiliary_loss_clip": 0.01623192, + "auxiliary_loss_mlp": 0.0149801, + "balance_loss_clip": 1.25435019, + "balance_loss_mlp": 1.12760258, + "epoch": 0.06216557446041003, + "flos": 28113125784480.0, + "grad_norm": 2.549058597127088, + "language_loss": 0.77020562, + "learning_rate": 3.989195257821926e-06, + "loss": 0.80141759, + "num_input_tokens_seen": 11078440, + "router_z_loss_clip": 3.6875, + "router_z_loss_mlp": 3.70507812, + "step": 517, + "time_per_iteration": 2.9750990867614746 + }, + { + "auxiliary_loss_clip": 0.01617571, + "auxiliary_loss_mlp": 0.01516718, + "balance_loss_clip": 1.24761248, + "balance_loss_mlp": 1.15107977, + "epoch": 0.06228581735104912, + "flos": 23480496125280.0, + "grad_norm": 2.5276743304415916, + "language_loss": 0.84787184, + "learning_rate": 3.989114245497765e-06, + "loss": 0.8792147, + "num_input_tokens_seen": 11098240, + "router_z_loss_clip": 3.69726562, + "router_z_loss_mlp": 3.65429688, + "step": 518, + "time_per_iteration": 2.963679313659668 + }, + { + "auxiliary_loss_clip": 0.0160705, + "auxiliary_loss_mlp": 0.01504145, + "balance_loss_clip": 1.23717928, + "balance_loss_mlp": 1.1337378, + "epoch": 0.06240606024168821, + "flos": 15197017065120.0, + "grad_norm": 2.5861590981207527, + "language_loss": 0.94881952, + "learning_rate": 3.989032931426554e-06, + "loss": 0.97993147, + "num_input_tokens_seen": 11115395, + "router_z_loss_clip": 3.69726562, + "router_z_loss_mlp": 3.70117188, + "step": 519, + "time_per_iteration": 2.9827494621276855 + }, + { + "auxiliary_loss_clip": 0.01612506, + "auxiliary_loss_mlp": 0.01494014, + "balance_loss_clip": 1.24382257, + "balance_loss_mlp": 1.12227178, + "epoch": 0.06252630313232731, + "flos": 20633959300320.0, + "grad_norm": 2.2847369334191376, + "language_loss": 0.86706781, + "learning_rate": 3.9889513156206295e-06, + "loss": 0.89813304, + "num_input_tokens_seen": 11134835, + "router_z_loss_clip": 3.68945312, + "router_z_loss_mlp": 3.71289062, + "step": 520, + "time_per_iteration": 2.90078067779541 + }, + { + "auxiliary_loss_clip": 0.01620838, + "auxiliary_loss_mlp": 0.0149327, + "balance_loss_clip": 1.25122881, + "balance_loss_mlp": 1.12972951, + "epoch": 0.06264654602296639, + "flos": 20780211742560.0, + "grad_norm": 4.352792537819167, + "language_loss": 0.73710555, + "learning_rate": 3.988869398092371e-06, + "loss": 0.76824659, + "num_input_tokens_seen": 11154745, + "router_z_loss_clip": 3.69726562, + "router_z_loss_mlp": 3.63476562, + "step": 521, + "time_per_iteration": 2.953077554702759 + }, + { + "auxiliary_loss_clip": 0.01612802, + "auxiliary_loss_mlp": 0.01516443, + "balance_loss_clip": 1.24223924, + "balance_loss_mlp": 1.15328431, + "epoch": 0.06276678891360549, + "flos": 29608344645120.0, + "grad_norm": 2.8673472511749076, + "language_loss": 0.79431552, + "learning_rate": 3.988787178854206e-06, + "loss": 0.82560802, + "num_input_tokens_seen": 11174280, + "router_z_loss_clip": 3.70703125, + "router_z_loss_mlp": 3.63085938, + "step": 522, + "time_per_iteration": 2.9484267234802246 + }, + { + "auxiliary_loss_clip": 0.01609908, + "auxiliary_loss_mlp": 0.01500323, + "balance_loss_clip": 1.23998868, + "balance_loss_mlp": 1.1222862, + "epoch": 0.06288703180424457, + "flos": 22128419597760.0, + "grad_norm": 2.5326864901347244, + "language_loss": 0.88004994, + "learning_rate": 3.988704657918608e-06, + "loss": 0.91115224, + "num_input_tokens_seen": 11193340, + "router_z_loss_clip": 3.69726562, + "router_z_loss_mlp": 3.77734375, + "step": 523, + "time_per_iteration": 2.9229884147644043 + }, + { + "auxiliary_loss_clip": 0.01623356, + "auxiliary_loss_mlp": 0.0151438, + "balance_loss_clip": 1.25301623, + "balance_loss_mlp": 1.1443541, + "epoch": 0.06300727469488367, + "flos": 14978510749440.0, + "grad_norm": 3.2735439655626433, + "language_loss": 0.79757303, + "learning_rate": 3.988621835298094e-06, + "loss": 0.82895041, + "num_input_tokens_seen": 11210555, + "router_z_loss_clip": 3.69921875, + "router_z_loss_mlp": 3.69726562, + "step": 524, + "time_per_iteration": 2.939039945602417 + }, + { + "auxiliary_loss_clip": 0.01621436, + "auxiliary_loss_mlp": 0.01504862, + "balance_loss_clip": 1.2528863, + "balance_loss_mlp": 1.13903213, + "epoch": 0.06312751758552275, + "flos": 24537716222400.0, + "grad_norm": 2.1412032387550424, + "language_loss": 0.91833431, + "learning_rate": 3.988538711005229e-06, + "loss": 0.94959724, + "num_input_tokens_seen": 11230010, + "router_z_loss_clip": 3.68554688, + "router_z_loss_mlp": 3.65625, + "step": 525, + "time_per_iteration": 3.0607681274414062 + }, + { + "auxiliary_loss_clip": 0.0161228, + "auxiliary_loss_mlp": 0.01498062, + "balance_loss_clip": 1.24278057, + "balance_loss_mlp": 1.12689173, + "epoch": 0.06324776047616185, + "flos": 21509160336000.0, + "grad_norm": 2.5039848520731556, + "language_loss": 0.88097906, + "learning_rate": 3.988455285052622e-06, + "loss": 0.91208243, + "num_input_tokens_seen": 11246190, + "router_z_loss_clip": 3.69335938, + "router_z_loss_mlp": 3.70898438, + "step": 526, + "time_per_iteration": 2.9122917652130127 + }, + { + "auxiliary_loss_clip": 0.01623219, + "auxiliary_loss_mlp": 0.01497429, + "balance_loss_clip": 1.25277686, + "balance_loss_mlp": 1.1274035, + "epoch": 0.06336800336680094, + "flos": 21690117408960.0, + "grad_norm": 8.09402896192722, + "language_loss": 0.83849299, + "learning_rate": 3.98837155745293e-06, + "loss": 0.86969942, + "num_input_tokens_seen": 11264230, + "router_z_loss_clip": 3.70117188, + "router_z_loss_mlp": 3.69726562, + "step": 527, + "time_per_iteration": 2.8976237773895264 + }, + { + "auxiliary_loss_clip": 0.01618928, + "auxiliary_loss_mlp": 0.01494907, + "balance_loss_clip": 1.24961233, + "balance_loss_mlp": 1.12469077, + "epoch": 0.06348824625744003, + "flos": 19502968203360.0, + "grad_norm": 4.265288826590074, + "language_loss": 0.76395375, + "learning_rate": 3.988287528218854e-06, + "loss": 0.79509211, + "num_input_tokens_seen": 11283015, + "router_z_loss_clip": 3.69140625, + "router_z_loss_mlp": 3.69726562, + "step": 528, + "time_per_iteration": 3.0403804779052734 + }, + { + "auxiliary_loss_clip": 0.01624396, + "auxiliary_loss_mlp": 0.01504293, + "balance_loss_clip": 1.25480843, + "balance_loss_mlp": 1.13350463, + "epoch": 0.06360848914807912, + "flos": 15482732808960.0, + "grad_norm": 2.126482306677506, + "language_loss": 0.90446782, + "learning_rate": 3.98820319736314e-06, + "loss": 0.93575472, + "num_input_tokens_seen": 11299630, + "router_z_loss_clip": 3.6953125, + "router_z_loss_mlp": 3.70507812, + "step": 529, + "time_per_iteration": 2.909414529800415 + }, + { + "auxiliary_loss_clip": 0.01620127, + "auxiliary_loss_mlp": 0.01503249, + "balance_loss_clip": 1.25065529, + "balance_loss_mlp": 1.13627493, + "epoch": 0.0637287320387182, + "flos": 20595385997280.0, + "grad_norm": 1.8620824661501747, + "language_loss": 0.85489762, + "learning_rate": 3.988118564898582e-06, + "loss": 0.88613141, + "num_input_tokens_seen": 11319170, + "router_z_loss_clip": 3.69335938, + "router_z_loss_mlp": 3.66601562, + "step": 530, + "time_per_iteration": 2.9370791912078857 + }, + { + "auxiliary_loss_clip": 0.01620478, + "auxiliary_loss_mlp": 0.01531692, + "balance_loss_clip": 1.24896514, + "balance_loss_mlp": 1.1673882, + "epoch": 0.0638489749293573, + "flos": 17413674743520.0, + "grad_norm": 2.6296578588399258, + "language_loss": 0.89743936, + "learning_rate": 3.988033630838019e-06, + "loss": 0.92896104, + "num_input_tokens_seen": 11333210, + "router_z_loss_clip": 3.71679688, + "router_z_loss_mlp": 3.640625, + "step": 531, + "time_per_iteration": 3.0020296573638916 + }, + { + "auxiliary_loss_clip": 0.0162167, + "auxiliary_loss_mlp": 0.01518643, + "balance_loss_clip": 1.25204301, + "balance_loss_mlp": 1.153386, + "epoch": 0.0639692178199964, + "flos": 23810057186400.0, + "grad_norm": 2.3168320524494055, + "language_loss": 0.88109994, + "learning_rate": 3.987948395194334e-06, + "loss": 0.91250306, + "num_input_tokens_seen": 11355590, + "router_z_loss_clip": 3.69921875, + "router_z_loss_mlp": 3.65039062, + "step": 532, + "time_per_iteration": 2.9478888511657715 + }, + { + "auxiliary_loss_clip": 0.01614878, + "auxiliary_loss_mlp": 0.01505931, + "balance_loss_clip": 1.24458075, + "balance_loss_mlp": 1.14181769, + "epoch": 0.06408946071063548, + "flos": 18479125251360.0, + "grad_norm": 2.1515849289681777, + "language_loss": 0.76870346, + "learning_rate": 3.987862857980458e-06, + "loss": 0.79991162, + "num_input_tokens_seen": 11371535, + "router_z_loss_clip": 3.703125, + "router_z_loss_mlp": 3.63867188, + "step": 533, + "time_per_iteration": 2.9857566356658936 + }, + { + "auxiliary_loss_clip": 0.01615308, + "auxiliary_loss_mlp": 0.01508446, + "balance_loss_clip": 1.24652243, + "balance_loss_mlp": 1.13613176, + "epoch": 0.06420970360127458, + "flos": 27164684743200.0, + "grad_norm": 2.888745879079046, + "language_loss": 0.76948917, + "learning_rate": 3.987777019209368e-06, + "loss": 0.80072671, + "num_input_tokens_seen": 11392050, + "router_z_loss_clip": 3.6875, + "router_z_loss_mlp": 3.72070312, + "step": 534, + "time_per_iteration": 2.990412473678589 + }, + { + "auxiliary_loss_clip": 0.01622054, + "auxiliary_loss_mlp": 0.01534141, + "balance_loss_clip": 1.25258684, + "balance_loss_mlp": 1.16602254, + "epoch": 0.06432994649191366, + "flos": 23661946264320.0, + "grad_norm": 1.8454388763772775, + "language_loss": 0.81385219, + "learning_rate": 3.987690878894084e-06, + "loss": 0.84541404, + "num_input_tokens_seen": 11411765, + "router_z_loss_clip": 3.68945312, + "router_z_loss_mlp": 3.68164062, + "step": 535, + "time_per_iteration": 3.8294458389282227 + }, + { + "auxiliary_loss_clip": 0.0162223, + "auxiliary_loss_mlp": 0.01495299, + "balance_loss_clip": 1.25240242, + "balance_loss_mlp": 1.12203074, + "epoch": 0.06445018938255276, + "flos": 23406194131200.0, + "grad_norm": 5.227703132624328, + "language_loss": 0.8511982, + "learning_rate": 3.987604437047673e-06, + "loss": 0.88237345, + "num_input_tokens_seen": 11431565, + "router_z_loss_clip": 3.69921875, + "router_z_loss_mlp": 3.72851562, + "step": 536, + "time_per_iteration": 3.018927812576294 + }, + { + "auxiliary_loss_clip": 0.01618796, + "auxiliary_loss_mlp": 0.0150421, + "balance_loss_clip": 1.25078344, + "balance_loss_mlp": 1.12998819, + "epoch": 0.06457043227319184, + "flos": 19648765507680.0, + "grad_norm": 3.6366010657094727, + "language_loss": 0.77857977, + "learning_rate": 3.987517693683251e-06, + "loss": 0.8098098, + "num_input_tokens_seen": 11450140, + "router_z_loss_clip": 3.67773438, + "router_z_loss_mlp": 3.73828125, + "step": 537, + "time_per_iteration": 3.030358076095581 + }, + { + "auxiliary_loss_clip": 0.0162274, + "auxiliary_loss_mlp": 0.01505364, + "balance_loss_clip": 1.25341737, + "balance_loss_mlp": 1.13018823, + "epoch": 0.06469067516383094, + "flos": 16980417000000.0, + "grad_norm": 4.0249466411404455, + "language_loss": 0.9623853, + "learning_rate": 3.9874306488139745e-06, + "loss": 0.99366629, + "num_input_tokens_seen": 11465400, + "router_z_loss_clip": 3.69140625, + "router_z_loss_mlp": 3.75, + "step": 538, + "time_per_iteration": 3.7469139099121094 + }, + { + "auxiliary_loss_clip": 0.01620117, + "auxiliary_loss_mlp": 0.01498101, + "balance_loss_clip": 1.25329518, + "balance_loss_mlp": 1.12407041, + "epoch": 0.06481091805447003, + "flos": 23298742560960.0, + "grad_norm": 2.415648763648971, + "language_loss": 0.88128489, + "learning_rate": 3.987343302453049e-06, + "loss": 0.91246712, + "num_input_tokens_seen": 11486675, + "router_z_loss_clip": 3.66992188, + "router_z_loss_mlp": 3.73632812, + "step": 539, + "time_per_iteration": 2.9881246089935303 + }, + { + "auxiliary_loss_clip": 0.01623972, + "auxiliary_loss_mlp": 0.01513963, + "balance_loss_clip": 1.25586772, + "balance_loss_mlp": 1.13554478, + "epoch": 0.06493116094510912, + "flos": 29175124829760.0, + "grad_norm": 1.6829827463419138, + "language_loss": 0.82775116, + "learning_rate": 3.987255654613724e-06, + "loss": 0.8591305, + "num_input_tokens_seen": 11510440, + "router_z_loss_clip": 3.67773438, + "router_z_loss_mlp": 3.78125, + "step": 540, + "time_per_iteration": 4.777977705001831 + }, + { + "auxiliary_loss_clip": 0.01618628, + "auxiliary_loss_mlp": 0.01501351, + "balance_loss_clip": 1.24978924, + "balance_loss_mlp": 1.12865484, + "epoch": 0.06505140383574821, + "flos": 19867120110720.0, + "grad_norm": 3.7659859002422373, + "language_loss": 0.70387065, + "learning_rate": 3.987167705309296e-06, + "loss": 0.73507035, + "num_input_tokens_seen": 11529715, + "router_z_loss_clip": 3.68945312, + "router_z_loss_mlp": 3.72070312, + "step": 541, + "time_per_iteration": 2.8921542167663574 + }, + { + "auxiliary_loss_clip": 0.01628352, + "auxiliary_loss_mlp": 0.01497878, + "balance_loss_clip": 1.26121187, + "balance_loss_mlp": 1.12842429, + "epoch": 0.0651716467263873, + "flos": 17926278926400.0, + "grad_norm": 2.1438859577563956, + "language_loss": 0.95432669, + "learning_rate": 3.987079454553108e-06, + "loss": 0.98558897, + "num_input_tokens_seen": 11547665, + "router_z_loss_clip": 3.66992188, + "router_z_loss_mlp": 3.69335938, + "step": 542, + "time_per_iteration": 2.9543814659118652 + }, + { + "auxiliary_loss_clip": 0.01615983, + "auxiliary_loss_mlp": 0.01501399, + "balance_loss_clip": 1.247226, + "balance_loss_mlp": 1.12832153, + "epoch": 0.0652918896170264, + "flos": 20844538630560.0, + "grad_norm": 1.9106240085701138, + "language_loss": 0.91186398, + "learning_rate": 3.986990902358546e-06, + "loss": 0.94303787, + "num_input_tokens_seen": 11564605, + "router_z_loss_clip": 3.68554688, + "router_z_loss_mlp": 3.7265625, + "step": 543, + "time_per_iteration": 2.910499334335327 + }, + { + "auxiliary_loss_clip": 0.01605884, + "auxiliary_loss_mlp": 0.01485252, + "balance_loss_clip": 1.23633146, + "balance_loss_mlp": 1.10855031, + "epoch": 0.06541213250766549, + "flos": 21874715585280.0, + "grad_norm": 2.0222477072760547, + "language_loss": 0.93325186, + "learning_rate": 3.986902048739045e-06, + "loss": 0.96416318, + "num_input_tokens_seen": 11584550, + "router_z_loss_clip": 3.69335938, + "router_z_loss_mlp": 3.76367188, + "step": 544, + "time_per_iteration": 2.911634683609009 + }, + { + "auxiliary_loss_clip": 0.01615967, + "auxiliary_loss_mlp": 0.01510061, + "balance_loss_clip": 1.24848425, + "balance_loss_mlp": 1.14022601, + "epoch": 0.06553237539830457, + "flos": 23112892755360.0, + "grad_norm": 2.943239080802864, + "language_loss": 0.801135, + "learning_rate": 3.986812893708082e-06, + "loss": 0.8323952, + "num_input_tokens_seen": 11600740, + "router_z_loss_clip": 3.67382812, + "router_z_loss_mlp": 3.69726562, + "step": 545, + "time_per_iteration": 2.9579854011535645 + }, + { + "auxiliary_loss_clip": 0.01617948, + "auxiliary_loss_mlp": 0.01508767, + "balance_loss_clip": 1.24873447, + "balance_loss_mlp": 1.13778734, + "epoch": 0.06565261828894367, + "flos": 17925292794240.0, + "grad_norm": 4.213413517942173, + "language_loss": 0.81477427, + "learning_rate": 3.9867234372791826e-06, + "loss": 0.84604144, + "num_input_tokens_seen": 11618695, + "router_z_loss_clip": 3.69335938, + "router_z_loss_mlp": 3.70507812, + "step": 546, + "time_per_iteration": 3.0808591842651367 + }, + { + "auxiliary_loss_clip": 0.01612002, + "auxiliary_loss_mlp": 0.01484196, + "balance_loss_clip": 1.24417734, + "balance_loss_mlp": 1.11588728, + "epoch": 0.06577286117958275, + "flos": 22785228102240.0, + "grad_norm": 1.5556627831466197, + "language_loss": 0.87365955, + "learning_rate": 3.986633679465918e-06, + "loss": 0.90462154, + "num_input_tokens_seen": 11638850, + "router_z_loss_clip": 3.6796875, + "router_z_loss_mlp": 3.6796875, + "step": 547, + "time_per_iteration": 2.976905107498169 + }, + { + "auxiliary_loss_clip": 0.01624837, + "auxiliary_loss_mlp": 0.01492644, + "balance_loss_clip": 1.25797582, + "balance_loss_mlp": 1.118613, + "epoch": 0.06589310407022185, + "flos": 23698471446720.0, + "grad_norm": 3.747589759729299, + "language_loss": 0.80940247, + "learning_rate": 3.986543620281904e-06, + "loss": 0.84057724, + "num_input_tokens_seen": 11658500, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 3.73828125, + "step": 548, + "time_per_iteration": 3.006171703338623 + }, + { + "auxiliary_loss_clip": 0.0160893, + "auxiliary_loss_mlp": 0.01494116, + "balance_loss_clip": 1.24026787, + "balance_loss_mlp": 1.11665142, + "epoch": 0.06601334696086093, + "flos": 26866907844480.0, + "grad_norm": 1.7533223380579717, + "language_loss": 0.91267037, + "learning_rate": 3.986453259740802e-06, + "loss": 0.94370079, + "num_input_tokens_seen": 11676670, + "router_z_loss_clip": 3.6875, + "router_z_loss_mlp": 3.77148438, + "step": 549, + "time_per_iteration": 2.965430736541748 + }, + { + "auxiliary_loss_clip": 0.01624422, + "auxiliary_loss_mlp": 0.01523334, + "balance_loss_clip": 1.25661087, + "balance_loss_mlp": 1.15025675, + "epoch": 0.06613358985150003, + "flos": 12569327909280.0, + "grad_norm": 2.9909258952530977, + "language_loss": 0.7946406, + "learning_rate": 3.986362597856319e-06, + "loss": 0.82611817, + "num_input_tokens_seen": 11693170, + "router_z_loss_clip": 3.67773438, + "router_z_loss_mlp": 3.7265625, + "step": 550, + "time_per_iteration": 3.031940460205078 + }, + { + "auxiliary_loss_clip": 0.01612465, + "auxiliary_loss_mlp": 0.01503357, + "balance_loss_clip": 1.24313831, + "balance_loss_mlp": 1.12474799, + "epoch": 0.06625383274213913, + "flos": 18334465791840.0, + "grad_norm": 2.734889335287335, + "language_loss": 0.81539446, + "learning_rate": 3.986271634642211e-06, + "loss": 0.84655273, + "num_input_tokens_seen": 11710150, + "router_z_loss_clip": 3.6953125, + "router_z_loss_mlp": 3.78515625, + "step": 551, + "time_per_iteration": 2.9194436073303223 + }, + { + "auxiliary_loss_clip": 0.01620893, + "auxiliary_loss_mlp": 0.01515508, + "balance_loss_clip": 1.25191617, + "balance_loss_mlp": 1.13384724, + "epoch": 0.06637407563277821, + "flos": 15377291431200.0, + "grad_norm": 2.1480846192204135, + "language_loss": 0.81704152, + "learning_rate": 3.986180370112274e-06, + "loss": 0.84840548, + "num_input_tokens_seen": 11726670, + "router_z_loss_clip": 3.6875, + "router_z_loss_mlp": 3.8125, + "step": 552, + "time_per_iteration": 3.0064406394958496 + }, + { + "auxiliary_loss_clip": 0.01611188, + "auxiliary_loss_mlp": 0.0148409, + "balance_loss_clip": 1.24158621, + "balance_loss_mlp": 1.09499049, + "epoch": 0.0664943185234173, + "flos": 24027577369920.0, + "grad_norm": 1.8247135767617615, + "language_loss": 0.74696428, + "learning_rate": 3.986088804280354e-06, + "loss": 0.77791709, + "num_input_tokens_seen": 11746400, + "router_z_loss_clip": 3.6953125, + "router_z_loss_mlp": 3.88867188, + "step": 553, + "time_per_iteration": 3.0191264152526855 + }, + { + "auxiliary_loss_clip": 0.01620359, + "auxiliary_loss_mlp": 0.01493854, + "balance_loss_clip": 1.25273836, + "balance_loss_mlp": 1.11524546, + "epoch": 0.06661456141405639, + "flos": 20959499976480.0, + "grad_norm": 2.6184401362920444, + "language_loss": 0.94305825, + "learning_rate": 3.985996937160342e-06, + "loss": 0.97420037, + "num_input_tokens_seen": 11765590, + "router_z_loss_clip": 3.67773438, + "router_z_loss_mlp": 3.78320312, + "step": 554, + "time_per_iteration": 2.958496570587158 + }, + { + "auxiliary_loss_clip": 0.01619558, + "auxiliary_loss_mlp": 0.01491312, + "balance_loss_clip": 1.25197983, + "balance_loss_mlp": 1.1127038, + "epoch": 0.06673480430469549, + "flos": 52227597661920.0, + "grad_norm": 2.8373609003859728, + "language_loss": 0.68944579, + "learning_rate": 3.985904768766173e-06, + "loss": 0.72055447, + "num_input_tokens_seen": 11788365, + "router_z_loss_clip": 3.67382812, + "router_z_loss_mlp": 3.78320312, + "step": 555, + "time_per_iteration": 3.26244854927063 + }, + { + "auxiliary_loss_clip": 0.01614812, + "auxiliary_loss_mlp": 0.01491678, + "balance_loss_clip": 1.24541497, + "balance_loss_mlp": 1.11459541, + "epoch": 0.06685504719533458, + "flos": 16218963609120.0, + "grad_norm": 6.189698515837439, + "language_loss": 0.75953853, + "learning_rate": 3.98581229911183e-06, + "loss": 0.7906034, + "num_input_tokens_seen": 11807285, + "router_z_loss_clip": 3.68945312, + "router_z_loss_mlp": 3.76757812, + "step": 556, + "time_per_iteration": 3.1750001907348633 + }, + { + "auxiliary_loss_clip": 0.01608891, + "auxiliary_loss_mlp": 0.01504772, + "balance_loss_clip": 1.23957491, + "balance_loss_mlp": 1.1233027, + "epoch": 0.06697529008597367, + "flos": 22493709349920.0, + "grad_norm": 10.599703330698445, + "language_loss": 0.92257023, + "learning_rate": 3.985719528211341e-06, + "loss": 0.95370686, + "num_input_tokens_seen": 11826655, + "router_z_loss_clip": 3.68945312, + "router_z_loss_mlp": 3.8125, + "step": 557, + "time_per_iteration": 3.0524628162384033 + }, + { + "auxiliary_loss_clip": 0.01660154, + "auxiliary_loss_mlp": 0.01428246, + "balance_loss_clip": 1.31030393, + "balance_loss_mlp": 1.07653046, + "epoch": 0.06709553297661276, + "flos": 62694091393920.0, + "grad_norm": 0.8567980403166309, + "language_loss": 0.63028216, + "learning_rate": 3.985626456078777e-06, + "loss": 0.66116619, + "num_input_tokens_seen": 11891310, + "router_z_loss_clip": 3.5, + "router_z_loss_mlp": 3.5234375, + "step": 558, + "time_per_iteration": 3.5849475860595703 + }, + { + "auxiliary_loss_clip": 0.01612702, + "auxiliary_loss_mlp": 0.01503965, + "balance_loss_clip": 1.24374628, + "balance_loss_mlp": 1.12897992, + "epoch": 0.06721577586725185, + "flos": 11218085801280.0, + "grad_norm": 2.6712936709686024, + "language_loss": 0.86487174, + "learning_rate": 3.985533082728259e-06, + "loss": 0.89603847, + "num_input_tokens_seen": 11906965, + "router_z_loss_clip": 3.6875, + "router_z_loss_mlp": 3.74609375, + "step": 559, + "time_per_iteration": 3.0756771564483643 + }, + { + "auxiliary_loss_clip": 0.01610107, + "auxiliary_loss_mlp": 0.01494326, + "balance_loss_clip": 1.24199557, + "balance_loss_mlp": 1.12143946, + "epoch": 0.06733601875789094, + "flos": 25924611165120.0, + "grad_norm": 2.0360293949272488, + "language_loss": 0.74759769, + "learning_rate": 3.985439408173951e-06, + "loss": 0.77864206, + "num_input_tokens_seen": 11927190, + "router_z_loss_clip": 3.6796875, + "router_z_loss_mlp": 3.72460938, + "step": 560, + "time_per_iteration": 3.143470048904419 + }, + { + "auxiliary_loss_clip": 0.01619121, + "auxiliary_loss_mlp": 0.01496991, + "balance_loss_clip": 1.25077009, + "balance_loss_mlp": 1.11857283, + "epoch": 0.06745626164853002, + "flos": 20815750792800.0, + "grad_norm": 2.4444674931061896, + "language_loss": 0.70742047, + "learning_rate": 3.9853454324300634e-06, + "loss": 0.7385816, + "num_input_tokens_seen": 11946400, + "router_z_loss_clip": 3.6796875, + "router_z_loss_mlp": 3.78125, + "step": 561, + "time_per_iteration": 2.9597833156585693 + }, + { + "auxiliary_loss_clip": 0.01617719, + "auxiliary_loss_mlp": 0.01491402, + "balance_loss_clip": 1.24947023, + "balance_loss_mlp": 1.12156713, + "epoch": 0.06757650453916912, + "flos": 19831353491520.0, + "grad_norm": 2.750289452660925, + "language_loss": 0.77968895, + "learning_rate": 3.985251155510852e-06, + "loss": 0.81078023, + "num_input_tokens_seen": 11965430, + "router_z_loss_clip": 3.6796875, + "router_z_loss_mlp": 3.69726562, + "step": 562, + "time_per_iteration": 3.7309200763702393 + }, + { + "auxiliary_loss_clip": 0.01615615, + "auxiliary_loss_mlp": 0.01511443, + "balance_loss_clip": 1.24741864, + "balance_loss_mlp": 1.14046371, + "epoch": 0.06769674742980822, + "flos": 25741605971520.0, + "grad_norm": 1.7753344290979574, + "language_loss": 0.80528986, + "learning_rate": 3.98515657743062e-06, + "loss": 0.83656049, + "num_input_tokens_seen": 11984895, + "router_z_loss_clip": 3.68359375, + "router_z_loss_mlp": 3.70898438, + "step": 563, + "time_per_iteration": 3.0401546955108643 + }, + { + "auxiliary_loss_clip": 0.01617867, + "auxiliary_loss_mlp": 0.01493902, + "balance_loss_clip": 1.2507236, + "balance_loss_mlp": 1.11872649, + "epoch": 0.0678169903204473, + "flos": 13076242868160.0, + "grad_norm": 2.4670070052179014, + "language_loss": 0.77855754, + "learning_rate": 3.985061698203711e-06, + "loss": 0.80967522, + "num_input_tokens_seen": 12002010, + "router_z_loss_clip": 3.66796875, + "router_z_loss_mlp": 3.74804688, + "step": 564, + "time_per_iteration": 2.9773948192596436 + }, + { + "auxiliary_loss_clip": 0.01649722, + "auxiliary_loss_mlp": 0.01427025, + "balance_loss_clip": 1.29997063, + "balance_loss_mlp": 1.08827972, + "epoch": 0.0679372332110864, + "flos": 70872432501600.0, + "grad_norm": 0.9185331684817385, + "language_loss": 0.63808143, + "learning_rate": 3.984966517844523e-06, + "loss": 0.66884887, + "num_input_tokens_seen": 12057255, + "router_z_loss_clip": 3.5, + "router_z_loss_mlp": 3.390625, + "step": 565, + "time_per_iteration": 4.2834625244140625 + }, + { + "auxiliary_loss_clip": 0.01617024, + "auxiliary_loss_mlp": 0.01492534, + "balance_loss_clip": 1.24686444, + "balance_loss_mlp": 1.11011064, + "epoch": 0.06805747610172548, + "flos": 28258923088800.0, + "grad_norm": 3.475376505728271, + "language_loss": 0.80268002, + "learning_rate": 3.984871036367492e-06, + "loss": 0.83377564, + "num_input_tokens_seen": 12077280, + "router_z_loss_clip": 3.703125, + "router_z_loss_mlp": 3.82226562, + "step": 566, + "time_per_iteration": 3.0244853496551514 + }, + { + "auxiliary_loss_clip": 0.01626148, + "auxiliary_loss_mlp": 0.01490223, + "balance_loss_clip": 1.25864124, + "balance_loss_mlp": 1.11657333, + "epoch": 0.06817771899236458, + "flos": 20122606746720.0, + "grad_norm": 2.199908044695027, + "language_loss": 0.83453482, + "learning_rate": 3.984775253787102e-06, + "loss": 0.86569858, + "num_input_tokens_seen": 12095570, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 3.734375, + "step": 567, + "time_per_iteration": 4.718690395355225 + }, + { + "auxiliary_loss_clip": 0.01621105, + "auxiliary_loss_mlp": 0.01492479, + "balance_loss_clip": 1.25217056, + "balance_loss_mlp": 1.11406112, + "epoch": 0.06829796188300366, + "flos": 17932537072800.0, + "grad_norm": 3.304875552206939, + "language_loss": 0.88120693, + "learning_rate": 3.984679170117885e-06, + "loss": 0.91234279, + "num_input_tokens_seen": 12111775, + "router_z_loss_clip": 3.69335938, + "router_z_loss_mlp": 3.78125, + "step": 568, + "time_per_iteration": 3.0503296852111816 + }, + { + "auxiliary_loss_clip": 0.01620886, + "auxiliary_loss_mlp": 0.01497139, + "balance_loss_clip": 1.2527926, + "balance_loss_mlp": 1.12463427, + "epoch": 0.06841820477364276, + "flos": 14503379952960.0, + "grad_norm": 4.680816963283824, + "language_loss": 0.785586, + "learning_rate": 3.984582785374415e-06, + "loss": 0.8167662, + "num_input_tokens_seen": 12129215, + "router_z_loss_clip": 3.67773438, + "router_z_loss_mlp": 3.72070312, + "step": 569, + "time_per_iteration": 3.015110731124878 + }, + { + "auxiliary_loss_clip": 0.01621574, + "auxiliary_loss_mlp": 0.0152291, + "balance_loss_clip": 1.25414431, + "balance_loss_mlp": 1.14964175, + "epoch": 0.06853844766428185, + "flos": 21940294102560.0, + "grad_norm": 2.5724091954316517, + "language_loss": 0.80693281, + "learning_rate": 3.9844860995713155e-06, + "loss": 0.83837765, + "num_input_tokens_seen": 12148755, + "router_z_loss_clip": 3.66992188, + "router_z_loss_mlp": 3.73242188, + "step": 570, + "time_per_iteration": 3.1310243606567383 + }, + { + "auxiliary_loss_clip": 0.01626191, + "auxiliary_loss_mlp": 0.01515477, + "balance_loss_clip": 1.25852525, + "balance_loss_mlp": 1.14239931, + "epoch": 0.06865869055492094, + "flos": 16802532108000.0, + "grad_norm": 2.520568931920914, + "language_loss": 0.83141416, + "learning_rate": 3.9843891127232524e-06, + "loss": 0.86283082, + "num_input_tokens_seen": 12166290, + "router_z_loss_clip": 3.6796875, + "router_z_loss_mlp": 3.73046875, + "step": 571, + "time_per_iteration": 3.0151562690734863 + }, + { + "auxiliary_loss_clip": 0.01614664, + "auxiliary_loss_mlp": 0.01504767, + "balance_loss_clip": 1.24752402, + "balance_loss_mlp": 1.1320715, + "epoch": 0.06877893344556003, + "flos": 19939298127840.0, + "grad_norm": 3.2869639442767737, + "language_loss": 0.67086166, + "learning_rate": 3.984291824844938e-06, + "loss": 0.70205593, + "num_input_tokens_seen": 12181385, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 3.72460938, + "step": 572, + "time_per_iteration": 3.0145223140716553 + }, + { + "auxiliary_loss_clip": 0.01616844, + "auxiliary_loss_mlp": 0.01494658, + "balance_loss_clip": 1.24864411, + "balance_loss_mlp": 1.12215257, + "epoch": 0.06889917633619912, + "flos": 23041625014080.0, + "grad_norm": 37.44169068041083, + "language_loss": 0.85389626, + "learning_rate": 3.984194235951132e-06, + "loss": 0.88501132, + "num_input_tokens_seen": 12197530, + "router_z_loss_clip": 3.68164062, + "router_z_loss_mlp": 3.72265625, + "step": 573, + "time_per_iteration": 2.9771077632904053 + }, + { + "auxiliary_loss_clip": 0.01629736, + "auxiliary_loss_mlp": 0.01502674, + "balance_loss_clip": 1.26215506, + "balance_loss_mlp": 1.12006044, + "epoch": 0.06901941922683821, + "flos": 20962723870080.0, + "grad_norm": 4.0099024480722285, + "language_loss": 0.85259569, + "learning_rate": 3.9840963460566375e-06, + "loss": 0.88391978, + "num_input_tokens_seen": 12216310, + "router_z_loss_clip": 3.67773438, + "router_z_loss_mlp": 3.8203125, + "step": 574, + "time_per_iteration": 3.006836175918579 + }, + { + "auxiliary_loss_clip": 0.01617913, + "auxiliary_loss_mlp": 0.01480282, + "balance_loss_clip": 1.24981546, + "balance_loss_mlp": 1.10377121, + "epoch": 0.06913966211747731, + "flos": 24823887104160.0, + "grad_norm": 1.6673563840263226, + "language_loss": 0.89637518, + "learning_rate": 3.983998155176305e-06, + "loss": 0.92735714, + "num_input_tokens_seen": 12236670, + "router_z_loss_clip": 3.6796875, + "router_z_loss_mlp": 3.76171875, + "step": 575, + "time_per_iteration": 3.249457359313965 + }, + { + "auxiliary_loss_clip": 0.01650039, + "auxiliary_loss_mlp": 0.01445938, + "balance_loss_clip": 1.30291545, + "balance_loss_mlp": 1.10566711, + "epoch": 0.06925990500811639, + "flos": 58374258184800.0, + "grad_norm": 0.8321213894163402, + "language_loss": 0.56958508, + "learning_rate": 3.9838996633250305e-06, + "loss": 0.60054481, + "num_input_tokens_seen": 12297185, + "router_z_loss_clip": 3.46875, + "router_z_loss_mlp": 3.40625, + "step": 576, + "time_per_iteration": 3.4213693141937256 + }, + { + "auxiliary_loss_clip": 0.01623851, + "auxiliary_loss_mlp": 0.01493084, + "balance_loss_clip": 1.25767565, + "balance_loss_mlp": 1.12153268, + "epoch": 0.06938014789875549, + "flos": 12751574539680.0, + "grad_norm": 2.2411724591274766, + "language_loss": 0.88190418, + "learning_rate": 3.983800870517753e-06, + "loss": 0.91307354, + "num_input_tokens_seen": 12313975, + "router_z_loss_clip": 3.66015625, + "router_z_loss_mlp": 3.71484375, + "step": 577, + "time_per_iteration": 2.963423490524292 + }, + { + "auxiliary_loss_clip": 0.01627713, + "auxiliary_loss_mlp": 0.01513819, + "balance_loss_clip": 1.26112068, + "balance_loss_mlp": 1.14341223, + "epoch": 0.06950039078939457, + "flos": 22822435991520.0, + "grad_norm": 2.8928614415696514, + "language_loss": 0.7837792, + "learning_rate": 3.983701776769463e-06, + "loss": 0.81519461, + "num_input_tokens_seen": 12331385, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 3.703125, + "step": 578, + "time_per_iteration": 2.9951162338256836 + }, + { + "auxiliary_loss_clip": 0.01626204, + "auxiliary_loss_mlp": 0.0149271, + "balance_loss_clip": 1.25882411, + "balance_loss_mlp": 1.1211586, + "epoch": 0.06962063368003367, + "flos": 21943328355360.0, + "grad_norm": 1.9950127299397507, + "language_loss": 0.85623652, + "learning_rate": 3.9836023820951885e-06, + "loss": 0.88742566, + "num_input_tokens_seen": 12350600, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 3.71289062, + "step": 579, + "time_per_iteration": 2.9291067123413086 + }, + { + "auxiliary_loss_clip": 0.01616855, + "auxiliary_loss_mlp": 0.01491974, + "balance_loss_clip": 1.24824619, + "balance_loss_mlp": 1.11870623, + "epoch": 0.06974087657067275, + "flos": 20708223366240.0, + "grad_norm": 3.1039924345397205, + "language_loss": 0.68804926, + "learning_rate": 3.983502686510011e-06, + "loss": 0.71913755, + "num_input_tokens_seen": 12371430, + "router_z_loss_clip": 3.68359375, + "router_z_loss_mlp": 3.72851562, + "step": 580, + "time_per_iteration": 2.9843943119049072 + }, + { + "auxiliary_loss_clip": 0.01622738, + "auxiliary_loss_mlp": 0.01496279, + "balance_loss_clip": 1.2564106, + "balance_loss_mlp": 1.12034059, + "epoch": 0.06986111946131185, + "flos": 22640454858240.0, + "grad_norm": 2.4137470155481875, + "language_loss": 0.73719704, + "learning_rate": 3.9834026900290525e-06, + "loss": 0.76838726, + "num_input_tokens_seen": 12390825, + "router_z_loss_clip": 3.66015625, + "router_z_loss_mlp": 3.75585938, + "step": 581, + "time_per_iteration": 2.989563465118408 + }, + { + "auxiliary_loss_clip": 0.01622085, + "auxiliary_loss_mlp": 0.0148683, + "balance_loss_clip": 1.25482559, + "balance_loss_mlp": 1.10326219, + "epoch": 0.06998136235195095, + "flos": 26945912930400.0, + "grad_norm": 2.8463395792743795, + "language_loss": 1.00045991, + "learning_rate": 3.983302392667482e-06, + "loss": 1.0315491, + "num_input_tokens_seen": 12411670, + "router_z_loss_clip": 3.67382812, + "router_z_loss_mlp": 3.83398438, + "step": 582, + "time_per_iteration": 2.983837604522705 + }, + { + "auxiliary_loss_clip": 0.0162635, + "auxiliary_loss_mlp": 0.01496334, + "balance_loss_clip": 1.2600863, + "balance_loss_mlp": 1.12421, + "epoch": 0.07010160524259003, + "flos": 22494733410240.0, + "grad_norm": 2.0827096989512857, + "language_loss": 0.93824023, + "learning_rate": 3.983201794440517e-06, + "loss": 0.96946704, + "num_input_tokens_seen": 12431245, + "router_z_loss_clip": 3.66015625, + "router_z_loss_mlp": 3.72070312, + "step": 583, + "time_per_iteration": 2.943354845046997 + }, + { + "auxiliary_loss_clip": 0.01627888, + "auxiliary_loss_mlp": 0.01502883, + "balance_loss_clip": 1.26035643, + "balance_loss_mlp": 1.12522817, + "epoch": 0.07022184813322913, + "flos": 18334503720000.0, + "grad_norm": 2.0589197159712933, + "language_loss": 0.6785773, + "learning_rate": 3.9831008953634165e-06, + "loss": 0.709885, + "num_input_tokens_seen": 12450535, + "router_z_loss_clip": 3.67382812, + "router_z_loss_mlp": 3.7734375, + "step": 584, + "time_per_iteration": 2.964200019836426 + }, + { + "auxiliary_loss_clip": 0.01627075, + "auxiliary_loss_mlp": 0.01498945, + "balance_loss_clip": 1.2592175, + "balance_loss_mlp": 1.12148058, + "epoch": 0.07034209102386821, + "flos": 24677558805600.0, + "grad_norm": 1.940816168062492, + "language_loss": 0.81306791, + "learning_rate": 3.9829996954514864e-06, + "loss": 0.84432805, + "num_input_tokens_seen": 12469675, + "router_z_loss_clip": 3.67382812, + "router_z_loss_mlp": 3.77148438, + "step": 585, + "time_per_iteration": 3.153329610824585 + }, + { + "auxiliary_loss_clip": 0.01628056, + "auxiliary_loss_mlp": 0.01492885, + "balance_loss_clip": 1.26156223, + "balance_loss_mlp": 1.11809063, + "epoch": 0.0704623339145073, + "flos": 25998344236800.0, + "grad_norm": 2.753247564014433, + "language_loss": 0.84255219, + "learning_rate": 3.982898194720079e-06, + "loss": 0.87376153, + "num_input_tokens_seen": 12490405, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 3.74414062, + "step": 586, + "time_per_iteration": 3.0260210037231445 + }, + { + "auxiliary_loss_clip": 0.01629097, + "auxiliary_loss_mlp": 0.01496367, + "balance_loss_clip": 1.26188684, + "balance_loss_mlp": 1.11947501, + "epoch": 0.0705825768051464, + "flos": 25340701312800.0, + "grad_norm": 3.2470187762342033, + "language_loss": 0.82619262, + "learning_rate": 3.982796393184592e-06, + "loss": 0.85744727, + "num_input_tokens_seen": 12509485, + "router_z_loss_clip": 3.67382812, + "router_z_loss_mlp": 3.765625, + "step": 587, + "time_per_iteration": 2.958587408065796 + }, + { + "auxiliary_loss_clip": 0.01657781, + "auxiliary_loss_mlp": 0.01435905, + "balance_loss_clip": 1.30913174, + "balance_loss_mlp": 1.09792328, + "epoch": 0.07070281969578548, + "flos": 66053763396000.0, + "grad_norm": 0.7970610258731058, + "language_loss": 0.62578279, + "learning_rate": 3.98269429086047e-06, + "loss": 0.65671962, + "num_input_tokens_seen": 12567325, + "router_z_loss_clip": 3.4921875, + "router_z_loss_mlp": 3.3828125, + "step": 588, + "time_per_iteration": 3.3900907039642334 + }, + { + "auxiliary_loss_clip": 0.01618735, + "auxiliary_loss_mlp": 0.01504582, + "balance_loss_clip": 1.25069439, + "balance_loss_mlp": 1.12959754, + "epoch": 0.07082306258642458, + "flos": 23655498477120.0, + "grad_norm": 2.7567277632908365, + "language_loss": 0.86413264, + "learning_rate": 3.982591887763199e-06, + "loss": 0.89536583, + "num_input_tokens_seen": 12584785, + "router_z_loss_clip": 3.67773438, + "router_z_loss_mlp": 3.75, + "step": 589, + "time_per_iteration": 3.887899875640869 + }, + { + "auxiliary_loss_clip": 0.01616633, + "auxiliary_loss_mlp": 0.01492119, + "balance_loss_clip": 1.24822521, + "balance_loss_mlp": 1.11122131, + "epoch": 0.07094330547706366, + "flos": 13881693288960.0, + "grad_norm": 2.1555914704708794, + "language_loss": 0.82012659, + "learning_rate": 3.982489183908316e-06, + "loss": 0.85121411, + "num_input_tokens_seen": 12601205, + "router_z_loss_clip": 3.68164062, + "router_z_loss_mlp": 3.8046875, + "step": 590, + "time_per_iteration": 2.9375176429748535 + }, + { + "auxiliary_loss_clip": 0.01619337, + "auxiliary_loss_mlp": 0.01491322, + "balance_loss_clip": 1.25102472, + "balance_loss_mlp": 1.11385751, + "epoch": 0.07106354836770276, + "flos": 24647443482240.0, + "grad_norm": 2.461541135715542, + "language_loss": 0.84638077, + "learning_rate": 3.982386179311399e-06, + "loss": 0.87748736, + "num_input_tokens_seen": 12621725, + "router_z_loss_clip": 3.68359375, + "router_z_loss_mlp": 3.76953125, + "step": 591, + "time_per_iteration": 3.016637086868286 + }, + { + "auxiliary_loss_clip": 0.01615142, + "auxiliary_loss_mlp": 0.01498467, + "balance_loss_clip": 1.24863136, + "balance_loss_mlp": 1.11756957, + "epoch": 0.07118379125834184, + "flos": 16219115321760.0, + "grad_norm": 2.2173062277622386, + "language_loss": 0.87511224, + "learning_rate": 3.982282873988075e-06, + "loss": 0.90624833, + "num_input_tokens_seen": 12639600, + "router_z_loss_clip": 3.66210938, + "router_z_loss_mlp": 3.80664062, + "step": 592, + "time_per_iteration": 3.0196151733398438 + }, + { + "auxiliary_loss_clip": 0.01614559, + "auxiliary_loss_mlp": 0.01492694, + "balance_loss_clip": 1.24708056, + "balance_loss_mlp": 1.11484873, + "epoch": 0.07130403414898094, + "flos": 19722081369600.0, + "grad_norm": 2.3931385575658384, + "language_loss": 0.8700946, + "learning_rate": 3.982179267954016e-06, + "loss": 0.90116715, + "num_input_tokens_seen": 12660030, + "router_z_loss_clip": 3.67382812, + "router_z_loss_mlp": 3.77539062, + "step": 593, + "time_per_iteration": 3.8983843326568604 + }, + { + "auxiliary_loss_clip": 0.01611784, + "auxiliary_loss_mlp": 0.01498911, + "balance_loss_clip": 1.24313581, + "balance_loss_mlp": 1.11820412, + "epoch": 0.07142427703962004, + "flos": 21874108734720.0, + "grad_norm": 207.33574456677331, + "language_loss": 0.95983887, + "learning_rate": 3.982075361224937e-06, + "loss": 0.99094582, + "num_input_tokens_seen": 12678395, + "router_z_loss_clip": 3.68554688, + "router_z_loss_mlp": 3.8046875, + "step": 594, + "time_per_iteration": 3.859348773956299 + }, + { + "auxiliary_loss_clip": 0.01614859, + "auxiliary_loss_mlp": 0.0150219, + "balance_loss_clip": 1.24711156, + "balance_loss_mlp": 1.12644243, + "epoch": 0.07154451993025912, + "flos": 18298585388160.0, + "grad_norm": 1.9015778115923243, + "language_loss": 0.88201928, + "learning_rate": 3.981971153816602e-06, + "loss": 0.91318983, + "num_input_tokens_seen": 12696000, + "router_z_loss_clip": 3.67773438, + "router_z_loss_mlp": 3.7578125, + "step": 595, + "time_per_iteration": 3.8413000106811523 + }, + { + "auxiliary_loss_clip": 0.01622665, + "auxiliary_loss_mlp": 0.01529246, + "balance_loss_clip": 1.25405526, + "balance_loss_mlp": 1.15712214, + "epoch": 0.07166476282089822, + "flos": 22162707018720.0, + "grad_norm": 1.6969071377265679, + "language_loss": 0.96488965, + "learning_rate": 3.981866645744819e-06, + "loss": 0.99640876, + "num_input_tokens_seen": 12716715, + "router_z_loss_clip": 3.68945312, + "router_z_loss_mlp": 3.71679688, + "step": 596, + "time_per_iteration": 3.018815517425537 + }, + { + "auxiliary_loss_clip": 0.01607973, + "auxiliary_loss_mlp": 0.01486572, + "balance_loss_clip": 1.23766851, + "balance_loss_mlp": 1.11025167, + "epoch": 0.0717850057115373, + "flos": 14138052272640.0, + "grad_norm": 2.9315672493147504, + "language_loss": 0.81636411, + "learning_rate": 3.9817618370254416e-06, + "loss": 0.84730959, + "num_input_tokens_seen": 12733370, + "router_z_loss_clip": 3.70117188, + "router_z_loss_mlp": 3.76171875, + "step": 597, + "time_per_iteration": 3.0676932334899902 + }, + { + "auxiliary_loss_clip": 0.01610543, + "auxiliary_loss_mlp": 0.01496096, + "balance_loss_clip": 1.24069786, + "balance_loss_mlp": 1.11920393, + "epoch": 0.0719052486021764, + "flos": 30920330743200.0, + "grad_norm": 2.2824686858787335, + "language_loss": 0.87317717, + "learning_rate": 3.9816567276743684e-06, + "loss": 0.90424359, + "num_input_tokens_seen": 12753235, + "router_z_loss_clip": 3.70117188, + "router_z_loss_mlp": 3.765625, + "step": 598, + "time_per_iteration": 2.9913277626037598 + }, + { + "auxiliary_loss_clip": 0.01615401, + "auxiliary_loss_mlp": 0.01488932, + "balance_loss_clip": 1.2468797, + "balance_loss_mlp": 1.11146796, + "epoch": 0.0720254914928155, + "flos": 21289098965760.0, + "grad_norm": 2.1441994661125396, + "language_loss": 0.77432382, + "learning_rate": 3.9815513177075466e-06, + "loss": 0.80536711, + "num_input_tokens_seen": 12772020, + "router_z_loss_clip": 3.6796875, + "router_z_loss_mlp": 3.77148438, + "step": 599, + "time_per_iteration": 2.9166977405548096 + }, + { + "auxiliary_loss_clip": 0.01622445, + "auxiliary_loss_mlp": 0.01499675, + "balance_loss_clip": 1.2528466, + "balance_loss_mlp": 1.12736011, + "epoch": 0.07214573438345458, + "flos": 27821682888480.0, + "grad_norm": 2.986295190531752, + "language_loss": 0.70602971, + "learning_rate": 3.9814456071409646e-06, + "loss": 0.73725092, + "num_input_tokens_seen": 12792555, + "router_z_loss_clip": 3.69335938, + "router_z_loss_mlp": 3.71875, + "step": 600, + "time_per_iteration": 3.0125789642333984 + }, + { + "auxiliary_loss_clip": 0.01609795, + "auxiliary_loss_mlp": 0.01486071, + "balance_loss_clip": 1.23965645, + "balance_loss_mlp": 1.10593665, + "epoch": 0.07226597727409367, + "flos": 25485929694720.0, + "grad_norm": 2.841261916116731, + "language_loss": 0.8593024, + "learning_rate": 3.981339595990659e-06, + "loss": 0.89026099, + "num_input_tokens_seen": 12811085, + "router_z_loss_clip": 3.70117188, + "router_z_loss_mlp": 3.79882812, + "step": 601, + "time_per_iteration": 2.946605682373047 + }, + { + "auxiliary_loss_clip": 0.01610465, + "auxiliary_loss_mlp": 0.01478843, + "balance_loss_clip": 1.24008286, + "balance_loss_mlp": 1.09737277, + "epoch": 0.07238622016473276, + "flos": 23516111031840.0, + "grad_norm": 2.8827334977652725, + "language_loss": 0.81480604, + "learning_rate": 3.981233284272713e-06, + "loss": 0.84569913, + "num_input_tokens_seen": 12830830, + "router_z_loss_clip": 3.70117188, + "router_z_loss_mlp": 3.81054688, + "step": 602, + "time_per_iteration": 2.9312777519226074 + }, + { + "auxiliary_loss_clip": 0.01614268, + "auxiliary_loss_mlp": 0.01501111, + "balance_loss_clip": 1.24463427, + "balance_loss_mlp": 1.13261151, + "epoch": 0.07250646305537185, + "flos": 25456231581120.0, + "grad_norm": 1.8336617792478533, + "language_loss": 0.8976832, + "learning_rate": 3.981126672003253e-06, + "loss": 0.928837, + "num_input_tokens_seen": 12853505, + "router_z_loss_clip": 3.69335938, + "router_z_loss_mlp": 3.68554688, + "step": 603, + "time_per_iteration": 2.9705843925476074 + }, + { + "auxiliary_loss_clip": 0.01607206, + "auxiliary_loss_mlp": 0.01480275, + "balance_loss_clip": 1.23591256, + "balance_loss_mlp": 1.10490918, + "epoch": 0.07262670594601094, + "flos": 27157099111200.0, + "grad_norm": 3.7679696121817865, + "language_loss": 0.78161788, + "learning_rate": 3.981019759198451e-06, + "loss": 0.81249273, + "num_input_tokens_seen": 12872455, + "router_z_loss_clip": 3.7109375, + "router_z_loss_mlp": 3.75390625, + "step": 604, + "time_per_iteration": 2.9903299808502197 + }, + { + "auxiliary_loss_clip": 0.01600652, + "auxiliary_loss_mlp": 0.01488122, + "balance_loss_clip": 1.23032951, + "balance_loss_mlp": 1.10646152, + "epoch": 0.07274694883665003, + "flos": 26654052824640.0, + "grad_norm": 2.640311800385749, + "language_loss": 0.8457132, + "learning_rate": 3.980912545874528e-06, + "loss": 0.87660098, + "num_input_tokens_seen": 12892620, + "router_z_loss_clip": 3.69921875, + "router_z_loss_mlp": 3.8125, + "step": 605, + "time_per_iteration": 3.0275371074676514 + }, + { + "auxiliary_loss_clip": 0.01607079, + "auxiliary_loss_mlp": 0.01506163, + "balance_loss_clip": 1.23613489, + "balance_loss_mlp": 1.12316787, + "epoch": 0.07286719172728913, + "flos": 29864969125920.0, + "grad_norm": 2.962978498131093, + "language_loss": 0.85618663, + "learning_rate": 3.980805032047746e-06, + "loss": 0.88731903, + "num_input_tokens_seen": 12914090, + "router_z_loss_clip": 3.703125, + "router_z_loss_mlp": 3.828125, + "step": 606, + "time_per_iteration": 2.912750005722046 + }, + { + "auxiliary_loss_clip": 0.01605139, + "auxiliary_loss_mlp": 0.01494053, + "balance_loss_clip": 1.23347425, + "balance_loss_mlp": 1.11525345, + "epoch": 0.07298743461792821, + "flos": 17383862845440.0, + "grad_norm": 1.9645762015264856, + "language_loss": 0.81308019, + "learning_rate": 3.980697217734415e-06, + "loss": 0.8440721, + "num_input_tokens_seen": 12931830, + "router_z_loss_clip": 3.71679688, + "router_z_loss_mlp": 3.78515625, + "step": 607, + "time_per_iteration": 2.996603488922119 + }, + { + "auxiliary_loss_clip": 0.0159989, + "auxiliary_loss_mlp": 0.01492005, + "balance_loss_clip": 1.22657967, + "balance_loss_mlp": 1.11072588, + "epoch": 0.07310767750856731, + "flos": 19500123591360.0, + "grad_norm": 2.2446581987467114, + "language_loss": 0.92056692, + "learning_rate": 3.980589102950891e-06, + "loss": 0.95148587, + "num_input_tokens_seen": 12949995, + "router_z_loss_clip": 3.72851562, + "router_z_loss_mlp": 3.80859375, + "step": 608, + "time_per_iteration": 3.0735602378845215 + }, + { + "auxiliary_loss_clip": 0.01612626, + "auxiliary_loss_mlp": 0.01513918, + "balance_loss_clip": 1.24094331, + "balance_loss_mlp": 1.13530898, + "epoch": 0.07322792039920639, + "flos": 29171256157440.0, + "grad_norm": 4.312769409837412, + "language_loss": 0.76024598, + "learning_rate": 3.9804806877135755e-06, + "loss": 0.79151142, + "num_input_tokens_seen": 12968040, + "router_z_loss_clip": 3.7109375, + "router_z_loss_mlp": 3.78710938, + "step": 609, + "time_per_iteration": 3.049607515335083 + }, + { + "auxiliary_loss_clip": 0.01606703, + "auxiliary_loss_mlp": 0.01503828, + "balance_loss_clip": 1.23469937, + "balance_loss_mlp": 1.12178588, + "epoch": 0.07334816328984549, + "flos": 23480116843680.0, + "grad_norm": 2.3162821793025072, + "language_loss": 0.86769706, + "learning_rate": 3.980371972038915e-06, + "loss": 0.8988024, + "num_input_tokens_seen": 12988530, + "router_z_loss_clip": 3.71875, + "router_z_loss_mlp": 3.81445312, + "step": 610, + "time_per_iteration": 3.029317855834961 + }, + { + "auxiliary_loss_clip": 0.01615416, + "auxiliary_loss_mlp": 0.01536861, + "balance_loss_clip": 1.24472272, + "balance_loss_mlp": 1.16912436, + "epoch": 0.07346840618048459, + "flos": 22964478408000.0, + "grad_norm": 1.7123064437706812, + "language_loss": 0.84319472, + "learning_rate": 3.980262955943399e-06, + "loss": 0.87471747, + "num_input_tokens_seen": 13008195, + "router_z_loss_clip": 3.70703125, + "router_z_loss_mlp": 3.67382812, + "step": 611, + "time_per_iteration": 2.9505503177642822 + }, + { + "auxiliary_loss_clip": 0.01610817, + "auxiliary_loss_mlp": 0.01506565, + "balance_loss_clip": 1.23885894, + "balance_loss_mlp": 1.13825583, + "epoch": 0.07358864907112367, + "flos": 17675722951200.0, + "grad_norm": 3.2222967017715116, + "language_loss": 0.87149191, + "learning_rate": 3.980153639443569e-06, + "loss": 0.90266573, + "num_input_tokens_seen": 13024180, + "router_z_loss_clip": 3.71679688, + "router_z_loss_mlp": 3.68164062, + "step": 612, + "time_per_iteration": 2.990020990371704 + }, + { + "auxiliary_loss_clip": 0.01612602, + "auxiliary_loss_mlp": 0.01495664, + "balance_loss_clip": 1.23987067, + "balance_loss_mlp": 1.11724615, + "epoch": 0.07370889196176277, + "flos": 24099300249120.0, + "grad_norm": 2.1376379412593907, + "language_loss": 0.80730247, + "learning_rate": 3.980044022556005e-06, + "loss": 0.83838511, + "num_input_tokens_seen": 13043865, + "router_z_loss_clip": 3.73046875, + "router_z_loss_mlp": 3.78125, + "step": 613, + "time_per_iteration": 3.0427515506744385 + }, + { + "auxiliary_loss_clip": 0.01600719, + "auxiliary_loss_mlp": 0.01500447, + "balance_loss_clip": 1.22759593, + "balance_loss_mlp": 1.11649752, + "epoch": 0.07382913485240185, + "flos": 25888351479840.0, + "grad_norm": 3.165510018681911, + "language_loss": 0.73144925, + "learning_rate": 3.9799341052973375e-06, + "loss": 0.76246095, + "num_input_tokens_seen": 13063700, + "router_z_loss_clip": 3.73242188, + "router_z_loss_mlp": 3.83398438, + "step": 614, + "time_per_iteration": 2.9559543132781982 + }, + { + "auxiliary_loss_clip": 0.01613664, + "auxiliary_loss_mlp": 0.01506135, + "balance_loss_clip": 1.24067438, + "balance_loss_mlp": 1.13401127, + "epoch": 0.07394937774304094, + "flos": 16875772113600.0, + "grad_norm": 2.5781337208923825, + "language_loss": 0.75310296, + "learning_rate": 3.979823887684241e-06, + "loss": 0.78430092, + "num_input_tokens_seen": 13082640, + "router_z_loss_clip": 3.73046875, + "router_z_loss_mlp": 3.71679688, + "step": 615, + "time_per_iteration": 3.085944414138794 + }, + { + "auxiliary_loss_clip": 0.01606113, + "auxiliary_loss_mlp": 0.01516247, + "balance_loss_clip": 1.23232579, + "balance_loss_mlp": 1.15156174, + "epoch": 0.07406962063368003, + "flos": 20705530466880.0, + "grad_norm": 3.093295778283457, + "language_loss": 0.85345006, + "learning_rate": 3.979713369733434e-06, + "loss": 0.88467366, + "num_input_tokens_seen": 13100505, + "router_z_loss_clip": 3.734375, + "router_z_loss_mlp": 3.64453125, + "step": 616, + "time_per_iteration": 3.00976300239563 + }, + { + "auxiliary_loss_clip": 0.01614838, + "auxiliary_loss_mlp": 0.01490796, + "balance_loss_clip": 1.24295473, + "balance_loss_mlp": 1.12172437, + "epoch": 0.07418986352431912, + "flos": 21432961933920.0, + "grad_norm": 2.0027076910102553, + "language_loss": 0.84946775, + "learning_rate": 3.979602551461683e-06, + "loss": 0.88052416, + "num_input_tokens_seen": 13121285, + "router_z_loss_clip": 3.72070312, + "router_z_loss_mlp": 3.6875, + "step": 617, + "time_per_iteration": 2.9938135147094727 + }, + { + "auxiliary_loss_clip": 0.01600946, + "auxiliary_loss_mlp": 0.01499323, + "balance_loss_clip": 1.22615218, + "balance_loss_mlp": 1.13673639, + "epoch": 0.07431010641495822, + "flos": 12022777658880.0, + "grad_norm": 9.1859819752128, + "language_loss": 0.91725719, + "learning_rate": 3.979491432885799e-06, + "loss": 0.94825983, + "num_input_tokens_seen": 13137550, + "router_z_loss_clip": 3.74609375, + "router_z_loss_mlp": 3.625, + "step": 618, + "time_per_iteration": 3.7344143390655518 + }, + { + "auxiliary_loss_clip": 0.01601797, + "auxiliary_loss_mlp": 0.01514803, + "balance_loss_clip": 1.22851157, + "balance_loss_mlp": 1.15221596, + "epoch": 0.0744303493055973, + "flos": 20959575832800.0, + "grad_norm": 9.118036861475657, + "language_loss": 0.8328532, + "learning_rate": 3.97938001402264e-06, + "loss": 0.86401916, + "num_input_tokens_seen": 13156675, + "router_z_loss_clip": 3.73242188, + "router_z_loss_mlp": 3.62304688, + "step": 619, + "time_per_iteration": 2.887725591659546 + }, + { + "auxiliary_loss_clip": 0.01604427, + "auxiliary_loss_mlp": 0.01513407, + "balance_loss_clip": 1.23007846, + "balance_loss_mlp": 1.14719629, + "epoch": 0.0745505921962364, + "flos": 16254995725440.0, + "grad_norm": 3.211355793221256, + "language_loss": 0.8013823, + "learning_rate": 3.979268294889105e-06, + "loss": 0.83256066, + "num_input_tokens_seen": 13172225, + "router_z_loss_clip": 3.74414062, + "router_z_loss_mlp": 3.66210938, + "step": 620, + "time_per_iteration": 2.9396188259124756 + }, + { + "auxiliary_loss_clip": 0.0160702, + "auxiliary_loss_mlp": 0.01510627, + "balance_loss_clip": 1.23198557, + "balance_loss_mlp": 1.14784968, + "epoch": 0.07467083508687548, + "flos": 50948078433120.0, + "grad_norm": 4.240569717006268, + "language_loss": 0.74174821, + "learning_rate": 3.979156275502143e-06, + "loss": 0.77292466, + "num_input_tokens_seen": 13195885, + "router_z_loss_clip": 3.75, + "router_z_loss_mlp": 3.62695312, + "step": 621, + "time_per_iteration": 4.014603614807129 + }, + { + "auxiliary_loss_clip": 0.01601681, + "auxiliary_loss_mlp": 0.01501977, + "balance_loss_clip": 1.22546673, + "balance_loss_mlp": 1.14129698, + "epoch": 0.07479107797751458, + "flos": 17531594485920.0, + "grad_norm": 2.932305563668747, + "language_loss": 0.92073339, + "learning_rate": 3.979043955878749e-06, + "loss": 0.95177001, + "num_input_tokens_seen": 13213730, + "router_z_loss_clip": 3.76367188, + "router_z_loss_mlp": 3.609375, + "step": 622, + "time_per_iteration": 3.749530076980591 + }, + { + "auxiliary_loss_clip": 0.01599072, + "auxiliary_loss_mlp": 0.01489666, + "balance_loss_clip": 1.22457147, + "balance_loss_mlp": 1.11639833, + "epoch": 0.07491132086815366, + "flos": 23476020602400.0, + "grad_norm": 2.499511731826671, + "language_loss": 0.83621585, + "learning_rate": 3.978931336035959e-06, + "loss": 0.86710322, + "num_input_tokens_seen": 13232540, + "router_z_loss_clip": 3.7421875, + "router_z_loss_mlp": 3.72851562, + "step": 623, + "time_per_iteration": 3.779956102371216 + }, + { + "auxiliary_loss_clip": 0.01607728, + "auxiliary_loss_mlp": 0.01533623, + "balance_loss_clip": 1.23298657, + "balance_loss_mlp": 1.1662674, + "epoch": 0.07503156375879276, + "flos": 20159662923360.0, + "grad_norm": 4.602609324046284, + "language_loss": 0.82521373, + "learning_rate": 3.9788184159908595e-06, + "loss": 0.85662723, + "num_input_tokens_seen": 13249670, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 3.671875, + "step": 624, + "time_per_iteration": 2.994453191757202 + }, + { + "auxiliary_loss_clip": 0.01600336, + "auxiliary_loss_mlp": 0.01504457, + "balance_loss_clip": 1.22510695, + "balance_loss_mlp": 1.13099861, + "epoch": 0.07515180664943186, + "flos": 15116949990720.0, + "grad_norm": 4.286645003977118, + "language_loss": 0.82709312, + "learning_rate": 3.97870519576058e-06, + "loss": 0.85814106, + "num_input_tokens_seen": 13266095, + "router_z_loss_clip": 3.75390625, + "router_z_loss_mlp": 3.73046875, + "step": 625, + "time_per_iteration": 2.8755693435668945 + }, + { + "auxiliary_loss_clip": 0.01602243, + "auxiliary_loss_mlp": 0.01496341, + "balance_loss_clip": 1.2283076, + "balance_loss_mlp": 1.12497997, + "epoch": 0.07527204954007094, + "flos": 21289819600800.0, + "grad_norm": 3.183557871091167, + "language_loss": 0.81339657, + "learning_rate": 3.978591675362295e-06, + "loss": 0.84438246, + "num_input_tokens_seen": 13284810, + "router_z_loss_clip": 3.7421875, + "router_z_loss_mlp": 3.71289062, + "step": 626, + "time_per_iteration": 3.0039143562316895 + }, + { + "auxiliary_loss_clip": 0.01607066, + "auxiliary_loss_mlp": 0.01508659, + "balance_loss_clip": 1.23326111, + "balance_loss_mlp": 1.14931464, + "epoch": 0.07539229243071004, + "flos": 21326306855040.0, + "grad_norm": 1.9162900403254581, + "language_loss": 0.87743521, + "learning_rate": 3.978477854813226e-06, + "loss": 0.90859252, + "num_input_tokens_seen": 13304150, + "router_z_loss_clip": 3.73828125, + "router_z_loss_mlp": 3.59570312, + "step": 627, + "time_per_iteration": 2.966366767883301 + }, + { + "auxiliary_loss_clip": 0.01606366, + "auxiliary_loss_mlp": 0.01488025, + "balance_loss_clip": 1.23124671, + "balance_loss_mlp": 1.1149478, + "epoch": 0.07551253532134912, + "flos": 13044231136800.0, + "grad_norm": 2.1161657552423354, + "language_loss": 0.825858, + "learning_rate": 3.97836373413064e-06, + "loss": 0.85680187, + "num_input_tokens_seen": 13322205, + "router_z_loss_clip": 3.7578125, + "router_z_loss_mlp": 3.72851562, + "step": 628, + "time_per_iteration": 3.048631191253662 + }, + { + "auxiliary_loss_clip": 0.01596856, + "auxiliary_loss_mlp": 0.01484657, + "balance_loss_clip": 1.2214576, + "balance_loss_mlp": 1.10547578, + "epoch": 0.07563277821198822, + "flos": 19210804672320.0, + "grad_norm": 4.994531848994994, + "language_loss": 0.74783713, + "learning_rate": 3.978249313331848e-06, + "loss": 0.77865225, + "num_input_tokens_seen": 13340435, + "router_z_loss_clip": 3.75195312, + "router_z_loss_mlp": 3.78710938, + "step": 629, + "time_per_iteration": 2.9261364936828613 + }, + { + "auxiliary_loss_clip": 0.01599854, + "auxiliary_loss_mlp": 0.01492655, + "balance_loss_clip": 1.22306097, + "balance_loss_mlp": 1.12262988, + "epoch": 0.07575302110262731, + "flos": 19539265816800.0, + "grad_norm": 3.244673289159214, + "language_loss": 0.61430514, + "learning_rate": 3.978134592434208e-06, + "loss": 0.64523029, + "num_input_tokens_seen": 13358185, + "router_z_loss_clip": 3.76757812, + "router_z_loss_mlp": 3.69726562, + "step": 630, + "time_per_iteration": 2.9133381843566895 + }, + { + "auxiliary_loss_clip": 0.01640332, + "auxiliary_loss_mlp": 0.01466629, + "balance_loss_clip": 1.27690399, + "balance_loss_mlp": 1.11949158, + "epoch": 0.0758732639932664, + "flos": 67969078564320.0, + "grad_norm": 2.268280203769542, + "language_loss": 0.59249765, + "learning_rate": 3.978019571455123e-06, + "loss": 0.62356722, + "num_input_tokens_seen": 13410130, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 3.4765625, + "step": 631, + "time_per_iteration": 3.437682867050171 + }, + { + "auxiliary_loss_clip": 0.01606113, + "auxiliary_loss_mlp": 0.01512614, + "balance_loss_clip": 1.23090935, + "balance_loss_mlp": 1.13877344, + "epoch": 0.07599350688390549, + "flos": 18991198440000.0, + "grad_norm": 3.44804789354196, + "language_loss": 0.84298396, + "learning_rate": 3.977904250412042e-06, + "loss": 0.8741712, + "num_input_tokens_seen": 13429085, + "router_z_loss_clip": 3.75390625, + "router_z_loss_mlp": 3.734375, + "step": 632, + "time_per_iteration": 2.9822638034820557 + }, + { + "auxiliary_loss_clip": 0.01613409, + "auxiliary_loss_mlp": 0.01485555, + "balance_loss_clip": 1.23860466, + "balance_loss_mlp": 1.10770953, + "epoch": 0.07611374977454458, + "flos": 21071123644320.0, + "grad_norm": 3.6216155927178004, + "language_loss": 0.86215556, + "learning_rate": 3.97778862932246e-06, + "loss": 0.8931452, + "num_input_tokens_seen": 13446250, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 3.77539062, + "step": 633, + "time_per_iteration": 2.9382052421569824 + }, + { + "auxiliary_loss_clip": 0.01607379, + "auxiliary_loss_mlp": 0.01494936, + "balance_loss_clip": 1.23200917, + "balance_loss_mlp": 1.12052333, + "epoch": 0.07623399266518367, + "flos": 18516484853280.0, + "grad_norm": 3.468719831673598, + "language_loss": 0.94560915, + "learning_rate": 3.9776727082039144e-06, + "loss": 0.9766323, + "num_input_tokens_seen": 13463220, + "router_z_loss_clip": 3.75585938, + "router_z_loss_mlp": 3.74023438, + "step": 634, + "time_per_iteration": 3.035001754760742 + }, + { + "auxiliary_loss_clip": 0.01670688, + "auxiliary_loss_mlp": 0.01457535, + "balance_loss_clip": 1.3049289, + "balance_loss_mlp": 1.11726379, + "epoch": 0.07635423555582276, + "flos": 44667832744800.0, + "grad_norm": 0.8085659566114921, + "language_loss": 0.55417132, + "learning_rate": 3.977556487073991e-06, + "loss": 0.58545351, + "num_input_tokens_seen": 13517775, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 3.40625, + "step": 635, + "time_per_iteration": 3.3069021701812744 + }, + { + "auxiliary_loss_clip": 0.01609608, + "auxiliary_loss_mlp": 0.01513126, + "balance_loss_clip": 1.23559475, + "balance_loss_mlp": 1.13432693, + "epoch": 0.07647447844646185, + "flos": 21763243630080.0, + "grad_norm": 1.9769079493459554, + "language_loss": 0.81483209, + "learning_rate": 3.97743996595032e-06, + "loss": 0.84605944, + "num_input_tokens_seen": 13537815, + "router_z_loss_clip": 3.73828125, + "router_z_loss_mlp": 3.78515625, + "step": 636, + "time_per_iteration": 2.935185194015503 + }, + { + "auxiliary_loss_clip": 0.01614378, + "auxiliary_loss_mlp": 0.01507099, + "balance_loss_clip": 1.23978877, + "balance_loss_mlp": 1.13898039, + "epoch": 0.07659472133710095, + "flos": 23809146910560.0, + "grad_norm": 15.67516825325162, + "language_loss": 0.81794238, + "learning_rate": 3.9773231448505804e-06, + "loss": 0.84915721, + "num_input_tokens_seen": 13559605, + "router_z_loss_clip": 3.75, + "router_z_loss_mlp": 3.68164062, + "step": 637, + "time_per_iteration": 2.954828977584839 + }, + { + "auxiliary_loss_clip": 0.01609599, + "auxiliary_loss_mlp": 0.01495049, + "balance_loss_clip": 1.23625004, + "balance_loss_mlp": 1.12063599, + "epoch": 0.07671496422774003, + "flos": 21472104159360.0, + "grad_norm": 21.308074732797987, + "language_loss": 0.78248549, + "learning_rate": 3.977206023792491e-06, + "loss": 0.81353199, + "num_input_tokens_seen": 13579495, + "router_z_loss_clip": 3.734375, + "router_z_loss_mlp": 3.7421875, + "step": 638, + "time_per_iteration": 2.966454267501831 + }, + { + "auxiliary_loss_clip": 0.01613132, + "auxiliary_loss_mlp": 0.01497293, + "balance_loss_clip": 1.23882914, + "balance_loss_mlp": 1.1125809, + "epoch": 0.07683520711837913, + "flos": 16982768545920.0, + "grad_norm": 2.5426293012263574, + "language_loss": 0.81188703, + "learning_rate": 3.97708860279382e-06, + "loss": 0.84299123, + "num_input_tokens_seen": 13597605, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 3.84375, + "step": 639, + "time_per_iteration": 3.2240042686462402 + }, + { + "auxiliary_loss_clip": 0.01612015, + "auxiliary_loss_mlp": 0.01503646, + "balance_loss_clip": 1.2390362, + "balance_loss_mlp": 1.12808919, + "epoch": 0.07695545000901821, + "flos": 23478978998880.0, + "grad_norm": 2.1531887140402577, + "language_loss": 0.78729272, + "learning_rate": 3.97697088187238e-06, + "loss": 0.81844926, + "num_input_tokens_seen": 13618120, + "router_z_loss_clip": 3.73242188, + "router_z_loss_mlp": 3.75390625, + "step": 640, + "time_per_iteration": 3.1754465103149414 + }, + { + "auxiliary_loss_clip": 0.01619747, + "auxiliary_loss_mlp": 0.01492315, + "balance_loss_clip": 1.2467339, + "balance_loss_mlp": 1.11370623, + "epoch": 0.07707569289965731, + "flos": 17635898018880.0, + "grad_norm": 3.59608180206574, + "language_loss": 0.92176008, + "learning_rate": 3.976852861046029e-06, + "loss": 0.95288074, + "num_input_tokens_seen": 13634735, + "router_z_loss_clip": 3.72851562, + "router_z_loss_mlp": 3.78320312, + "step": 641, + "time_per_iteration": 2.9957902431488037 + }, + { + "auxiliary_loss_clip": 0.01614788, + "auxiliary_loss_mlp": 0.01496904, + "balance_loss_clip": 1.24396801, + "balance_loss_mlp": 1.11982083, + "epoch": 0.0771959357902964, + "flos": 25778055297600.0, + "grad_norm": 1.8493434341761723, + "language_loss": 0.80381811, + "learning_rate": 3.97673454033267e-06, + "loss": 0.83493501, + "num_input_tokens_seen": 13656835, + "router_z_loss_clip": 3.71289062, + "router_z_loss_mlp": 3.76953125, + "step": 642, + "time_per_iteration": 3.0055220127105713 + }, + { + "auxiliary_loss_clip": 0.01606602, + "auxiliary_loss_mlp": 0.01494562, + "balance_loss_clip": 1.23502207, + "balance_loss_mlp": 1.12091243, + "epoch": 0.07731617868093549, + "flos": 19830708712800.0, + "grad_norm": 2.1034788177003185, + "language_loss": 0.83006346, + "learning_rate": 3.976615919750254e-06, + "loss": 0.86107516, + "num_input_tokens_seen": 13674535, + "router_z_loss_clip": 3.71679688, + "router_z_loss_mlp": 3.73242188, + "step": 643, + "time_per_iteration": 2.9798550605773926 + }, + { + "auxiliary_loss_clip": 0.01611985, + "auxiliary_loss_mlp": 0.0148929, + "balance_loss_clip": 1.2395606, + "balance_loss_mlp": 1.10877383, + "epoch": 0.07743642157157458, + "flos": 21326799921120.0, + "grad_norm": 2.485431881594537, + "language_loss": 0.87222838, + "learning_rate": 3.976496999316775e-06, + "loss": 0.90324116, + "num_input_tokens_seen": 13693290, + "router_z_loss_clip": 3.7265625, + "router_z_loss_mlp": 3.80273438, + "step": 644, + "time_per_iteration": 3.074167251586914 + }, + { + "auxiliary_loss_clip": 0.01614879, + "auxiliary_loss_mlp": 0.01498887, + "balance_loss_clip": 1.24278402, + "balance_loss_mlp": 1.12275815, + "epoch": 0.07755666446221367, + "flos": 19970892649440.0, + "grad_norm": 2.583404678542188, + "language_loss": 0.84182286, + "learning_rate": 3.976377779050271e-06, + "loss": 0.87296057, + "num_input_tokens_seen": 13711420, + "router_z_loss_clip": 3.72070312, + "router_z_loss_mlp": 3.75976562, + "step": 645, + "time_per_iteration": 3.0270638465881348 + }, + { + "auxiliary_loss_clip": 0.01604714, + "auxiliary_loss_mlp": 0.01480317, + "balance_loss_clip": 1.23341346, + "balance_loss_mlp": 1.10132658, + "epoch": 0.07767690735285276, + "flos": 23625496938240.0, + "grad_norm": 2.7551995785999646, + "language_loss": 0.84759599, + "learning_rate": 3.976258258968831e-06, + "loss": 0.87844628, + "num_input_tokens_seen": 13729965, + "router_z_loss_clip": 3.7109375, + "router_z_loss_mlp": 3.78515625, + "step": 646, + "time_per_iteration": 3.723961591720581 + }, + { + "auxiliary_loss_clip": 0.01611502, + "auxiliary_loss_mlp": 0.01478758, + "balance_loss_clip": 1.23995721, + "balance_loss_mlp": 1.10091186, + "epoch": 0.07779715024349185, + "flos": 22238412354720.0, + "grad_norm": 3.892454435334109, + "language_loss": 0.74012077, + "learning_rate": 3.976138439090583e-06, + "loss": 0.77102339, + "num_input_tokens_seen": 13748045, + "router_z_loss_clip": 3.71289062, + "router_z_loss_mlp": 3.77539062, + "step": 647, + "time_per_iteration": 2.9621121883392334 + }, + { + "auxiliary_loss_clip": 0.0160797, + "auxiliary_loss_mlp": 0.01499356, + "balance_loss_clip": 1.23668718, + "balance_loss_mlp": 1.12189126, + "epoch": 0.07791739313413094, + "flos": 20956958789760.0, + "grad_norm": 2.655798205662039, + "language_loss": 0.84892422, + "learning_rate": 3.976018319433706e-06, + "loss": 0.87999749, + "num_input_tokens_seen": 13765590, + "router_z_loss_clip": 3.71484375, + "router_z_loss_mlp": 3.77148438, + "step": 648, + "time_per_iteration": 3.829627752304077 + }, + { + "auxiliary_loss_clip": 0.01612795, + "auxiliary_loss_mlp": 0.01487716, + "balance_loss_clip": 1.2435745, + "balance_loss_mlp": 1.10643673, + "epoch": 0.07803763602477004, + "flos": 19314046216800.0, + "grad_norm": 3.2024855185351946, + "language_loss": 0.91652662, + "learning_rate": 3.9758979000164205e-06, + "loss": 0.94753182, + "num_input_tokens_seen": 13782410, + "router_z_loss_clip": 3.69140625, + "router_z_loss_mlp": 3.81054688, + "step": 649, + "time_per_iteration": 2.8743624687194824 + }, + { + "auxiliary_loss_clip": 0.01615385, + "auxiliary_loss_mlp": 0.01492161, + "balance_loss_clip": 1.24502349, + "balance_loss_mlp": 1.11145449, + "epoch": 0.07815787891540912, + "flos": 22713012156960.0, + "grad_norm": 2.270961848122062, + "language_loss": 0.72181034, + "learning_rate": 3.975777180856995e-06, + "loss": 0.75288582, + "num_input_tokens_seen": 13801530, + "router_z_loss_clip": 3.70703125, + "router_z_loss_mlp": 3.80273438, + "step": 650, + "time_per_iteration": 3.9291627407073975 + }, + { + "auxiliary_loss_clip": 0.01607153, + "auxiliary_loss_mlp": 0.0149392, + "balance_loss_clip": 1.23647404, + "balance_loss_mlp": 1.11359453, + "epoch": 0.07827812180604822, + "flos": 22713353510400.0, + "grad_norm": 4.729566125156994, + "language_loss": 0.85947073, + "learning_rate": 3.975656161973742e-06, + "loss": 0.89048147, + "num_input_tokens_seen": 13820615, + "router_z_loss_clip": 3.7109375, + "router_z_loss_mlp": 3.79882812, + "step": 651, + "time_per_iteration": 2.970913887023926 + }, + { + "auxiliary_loss_clip": 0.01605764, + "auxiliary_loss_mlp": 0.01490028, + "balance_loss_clip": 1.2335093, + "balance_loss_mlp": 1.12057471, + "epoch": 0.0783983646966873, + "flos": 21727742508000.0, + "grad_norm": 3.971070357378723, + "language_loss": 0.88737023, + "learning_rate": 3.9755348433850194e-06, + "loss": 0.91832817, + "num_input_tokens_seen": 13835955, + "router_z_loss_clip": 3.7265625, + "router_z_loss_mlp": 3.69335938, + "step": 652, + "time_per_iteration": 2.945131778717041 + }, + { + "auxiliary_loss_clip": 0.0166079, + "auxiliary_loss_mlp": 0.01451866, + "balance_loss_clip": 1.30384696, + "balance_loss_mlp": 1.10015106, + "epoch": 0.0785186075873264, + "flos": 60646405489920.0, + "grad_norm": 0.9828107607775898, + "language_loss": 0.63559651, + "learning_rate": 3.975413225109232e-06, + "loss": 0.66672301, + "num_input_tokens_seen": 13896505, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 3.5234375, + "step": 653, + "time_per_iteration": 3.4952197074890137 + }, + { + "auxiliary_loss_clip": 0.016076, + "auxiliary_loss_mlp": 0.01498212, + "balance_loss_clip": 1.23734844, + "balance_loss_mlp": 1.13104701, + "epoch": 0.0786388504779655, + "flos": 23880149154720.0, + "grad_norm": 4.308216236728757, + "language_loss": 0.93350458, + "learning_rate": 3.975291307164829e-06, + "loss": 0.96456265, + "num_input_tokens_seen": 13915150, + "router_z_loss_clip": 3.703125, + "router_z_loss_mlp": 3.671875, + "step": 654, + "time_per_iteration": 2.899425745010376 + }, + { + "auxiliary_loss_clip": 0.01614706, + "auxiliary_loss_mlp": 0.0149352, + "balance_loss_clip": 1.24544704, + "balance_loss_mlp": 1.12578321, + "epoch": 0.07875909336860458, + "flos": 15160491882720.0, + "grad_norm": 2.8783463539114265, + "language_loss": 0.85150021, + "learning_rate": 3.975169089570306e-06, + "loss": 0.88258243, + "num_input_tokens_seen": 13933525, + "router_z_loss_clip": 3.69335938, + "router_z_loss_mlp": 3.67578125, + "step": 655, + "time_per_iteration": 2.9004039764404297 + }, + { + "auxiliary_loss_clip": 0.01614039, + "auxiliary_loss_mlp": 0.01506562, + "balance_loss_clip": 1.24301434, + "balance_loss_mlp": 1.13844419, + "epoch": 0.07887933625924368, + "flos": 22239057133440.0, + "grad_norm": 2.3717687409899826, + "language_loss": 0.91821241, + "learning_rate": 3.975046572344202e-06, + "loss": 0.94941849, + "num_input_tokens_seen": 13949985, + "router_z_loss_clip": 3.71289062, + "router_z_loss_mlp": 3.67773438, + "step": 656, + "time_per_iteration": 2.912837505340576 + }, + { + "auxiliary_loss_clip": 0.01610382, + "auxiliary_loss_mlp": 0.01480776, + "balance_loss_clip": 1.24032474, + "balance_loss_mlp": 1.11227679, + "epoch": 0.07899957914988276, + "flos": 20779870389120.0, + "grad_norm": 2.4387215412785737, + "language_loss": 0.70929879, + "learning_rate": 3.974923755505103e-06, + "loss": 0.74021041, + "num_input_tokens_seen": 13969215, + "router_z_loss_clip": 3.69726562, + "router_z_loss_mlp": 3.68359375, + "step": 657, + "time_per_iteration": 2.9975130558013916 + }, + { + "auxiliary_loss_clip": 0.01612676, + "auxiliary_loss_mlp": 0.01520442, + "balance_loss_clip": 1.240955, + "balance_loss_mlp": 1.16033423, + "epoch": 0.07911982204052186, + "flos": 23005289472480.0, + "grad_norm": 2.1900425434623263, + "language_loss": 0.91023052, + "learning_rate": 3.974800639071641e-06, + "loss": 0.94156164, + "num_input_tokens_seen": 13989935, + "router_z_loss_clip": 3.71484375, + "router_z_loss_mlp": 3.60351562, + "step": 658, + "time_per_iteration": 2.938972234725952 + }, + { + "auxiliary_loss_clip": 0.0160516, + "auxiliary_loss_mlp": 0.01496646, + "balance_loss_clip": 1.23496461, + "balance_loss_mlp": 1.12547565, + "epoch": 0.07924006493116094, + "flos": 23113575462240.0, + "grad_norm": 3.4758718838403593, + "language_loss": 1.00633979, + "learning_rate": 3.974677223062492e-06, + "loss": 1.03735781, + "num_input_tokens_seen": 14007150, + "router_z_loss_clip": 3.70507812, + "router_z_loss_mlp": 3.70898438, + "step": 659, + "time_per_iteration": 2.926795721054077 + }, + { + "auxiliary_loss_clip": 0.01603225, + "auxiliary_loss_mlp": 0.01494803, + "balance_loss_clip": 1.23149705, + "balance_loss_mlp": 1.12878275, + "epoch": 0.07936030782180004, + "flos": 16474033035360.0, + "grad_norm": 2.5612105955435447, + "language_loss": 0.74502003, + "learning_rate": 3.974553507496378e-06, + "loss": 0.77600038, + "num_input_tokens_seen": 14025725, + "router_z_loss_clip": 3.71484375, + "router_z_loss_mlp": 3.65820312, + "step": 660, + "time_per_iteration": 2.9112393856048584 + }, + { + "auxiliary_loss_clip": 0.01600334, + "auxiliary_loss_mlp": 0.01533113, + "balance_loss_clip": 1.22901523, + "balance_loss_mlp": 1.173769, + "epoch": 0.07948055071243913, + "flos": 23735603479680.0, + "grad_norm": 2.2026002914359286, + "language_loss": 0.88945544, + "learning_rate": 3.974429492392068e-06, + "loss": 0.9207899, + "num_input_tokens_seen": 14045750, + "router_z_loss_clip": 3.71289062, + "router_z_loss_mlp": 3.59179688, + "step": 661, + "time_per_iteration": 2.9406557083129883 + }, + { + "auxiliary_loss_clip": 0.01608061, + "auxiliary_loss_mlp": 0.01503087, + "balance_loss_clip": 1.23706102, + "balance_loss_mlp": 1.13687634, + "epoch": 0.07960079360307822, + "flos": 19575373789440.0, + "grad_norm": 2.2732392005051336, + "language_loss": 0.91074866, + "learning_rate": 3.974305177768373e-06, + "loss": 0.94186014, + "num_input_tokens_seen": 14063960, + "router_z_loss_clip": 3.7109375, + "router_z_loss_mlp": 3.66015625, + "step": 662, + "time_per_iteration": 2.9151558876037598 + }, + { + "auxiliary_loss_clip": 0.01614958, + "auxiliary_loss_mlp": 0.01513639, + "balance_loss_clip": 1.24467468, + "balance_loss_mlp": 1.15181506, + "epoch": 0.07972103649371731, + "flos": 23515655893920.0, + "grad_norm": 2.9868253467122226, + "language_loss": 0.86649525, + "learning_rate": 3.974180563644152e-06, + "loss": 0.89778125, + "num_input_tokens_seen": 14082525, + "router_z_loss_clip": 3.70117188, + "router_z_loss_mlp": 3.6171875, + "step": 663, + "time_per_iteration": 2.9430432319641113 + }, + { + "auxiliary_loss_clip": 0.01613414, + "auxiliary_loss_mlp": 0.01522697, + "balance_loss_clip": 1.24270296, + "balance_loss_mlp": 1.16087294, + "epoch": 0.0798412793843564, + "flos": 16728723180000.0, + "grad_norm": 2.579591473841493, + "language_loss": 0.89572358, + "learning_rate": 3.97405565003831e-06, + "loss": 0.92708468, + "num_input_tokens_seen": 14098610, + "router_z_loss_clip": 3.70507812, + "router_z_loss_mlp": 3.61914062, + "step": 664, + "time_per_iteration": 2.905055284500122 + }, + { + "auxiliary_loss_clip": 0.01614733, + "auxiliary_loss_mlp": 0.0148908, + "balance_loss_clip": 1.24205077, + "balance_loss_mlp": 1.12267852, + "epoch": 0.07996152227499549, + "flos": 18225307454400.0, + "grad_norm": 2.1294211574767705, + "language_loss": 0.77877462, + "learning_rate": 3.973930436969794e-06, + "loss": 0.80981272, + "num_input_tokens_seen": 14117065, + "router_z_loss_clip": 3.72460938, + "router_z_loss_mlp": 3.66210938, + "step": 665, + "time_per_iteration": 2.9639365673065186 + }, + { + "auxiliary_loss_clip": 0.01605334, + "auxiliary_loss_mlp": 0.01507228, + "balance_loss_clip": 1.23199546, + "balance_loss_mlp": 1.13338745, + "epoch": 0.08008176516563459, + "flos": 20596978980000.0, + "grad_norm": 2.713533586017355, + "language_loss": 0.8587842, + "learning_rate": 3.973804924457602e-06, + "loss": 0.88990986, + "num_input_tokens_seen": 14135145, + "router_z_loss_clip": 3.73046875, + "router_z_loss_mlp": 3.73632812, + "step": 666, + "time_per_iteration": 2.9098644256591797 + }, + { + "auxiliary_loss_clip": 0.01610355, + "auxiliary_loss_mlp": 0.0150538, + "balance_loss_clip": 1.23815322, + "balance_loss_mlp": 1.13993239, + "epoch": 0.08020200805627367, + "flos": 31837139334720.0, + "grad_norm": 2.169030462977759, + "language_loss": 0.85413337, + "learning_rate": 3.973679112520771e-06, + "loss": 0.88529074, + "num_input_tokens_seen": 14156860, + "router_z_loss_clip": 3.72265625, + "router_z_loss_mlp": 3.65234375, + "step": 667, + "time_per_iteration": 2.9757566452026367 + }, + { + "auxiliary_loss_clip": 0.01606307, + "auxiliary_loss_mlp": 0.01502983, + "balance_loss_clip": 1.2345717, + "balance_loss_mlp": 1.13467431, + "epoch": 0.08032225094691277, + "flos": 17785336426560.0, + "grad_norm": 2.205471508083889, + "language_loss": 0.98831236, + "learning_rate": 3.973553001178389e-06, + "loss": 1.01940525, + "num_input_tokens_seen": 14174365, + "router_z_loss_clip": 3.71289062, + "router_z_loss_mlp": 3.68164062, + "step": 668, + "time_per_iteration": 3.0019290447235107 + }, + { + "auxiliary_loss_clip": 0.01610051, + "auxiliary_loss_mlp": 0.01494737, + "balance_loss_clip": 1.23800159, + "balance_loss_mlp": 1.12528396, + "epoch": 0.08044249383755185, + "flos": 24064026696000.0, + "grad_norm": 2.5209136645189045, + "language_loss": 0.75735676, + "learning_rate": 3.973426590449585e-06, + "loss": 0.7884047, + "num_input_tokens_seen": 14192320, + "router_z_loss_clip": 3.72070312, + "router_z_loss_mlp": 3.69335938, + "step": 669, + "time_per_iteration": 2.906491756439209 + }, + { + "auxiliary_loss_clip": 0.01611545, + "auxiliary_loss_mlp": 0.01494115, + "balance_loss_clip": 1.23972523, + "balance_loss_mlp": 1.11932063, + "epoch": 0.08056273672819095, + "flos": 18225610879680.0, + "grad_norm": 1.993466007533219, + "language_loss": 0.75271708, + "learning_rate": 3.9732998803535364e-06, + "loss": 0.78377366, + "num_input_tokens_seen": 14210380, + "router_z_loss_clip": 3.71875, + "router_z_loss_mlp": 3.74414062, + "step": 670, + "time_per_iteration": 2.9270567893981934 + }, + { + "auxiliary_loss_clip": 0.0161053, + "auxiliary_loss_mlp": 0.01498491, + "balance_loss_clip": 1.23786569, + "balance_loss_mlp": 1.11644936, + "epoch": 0.08068297961883003, + "flos": 19678577405760.0, + "grad_norm": 2.873903791047651, + "language_loss": 0.85663152, + "learning_rate": 3.973172870909465e-06, + "loss": 0.88772172, + "num_input_tokens_seen": 14225145, + "router_z_loss_clip": 3.72460938, + "router_z_loss_mlp": 3.81445312, + "step": 671, + "time_per_iteration": 2.902280569076538 + }, + { + "auxiliary_loss_clip": 0.016041, + "auxiliary_loss_mlp": 0.0149365, + "balance_loss_clip": 1.23075104, + "balance_loss_mlp": 1.10245252, + "epoch": 0.08080322250946913, + "flos": 23150972992320.0, + "grad_norm": 3.0237266861402206, + "language_loss": 0.81284297, + "learning_rate": 3.973045562136638e-06, + "loss": 0.84382045, + "num_input_tokens_seen": 14241960, + "router_z_loss_clip": 3.734375, + "router_z_loss_mlp": 3.90625, + "step": 672, + "time_per_iteration": 2.970181703567505 + }, + { + "auxiliary_loss_clip": 0.01615066, + "auxiliary_loss_mlp": 0.01470472, + "balance_loss_clip": 1.24167418, + "balance_loss_mlp": 1.08862138, + "epoch": 0.08092346540010822, + "flos": 21765860673120.0, + "grad_norm": 2.101397530840726, + "language_loss": 0.91626441, + "learning_rate": 3.972917954054368e-06, + "loss": 0.94711983, + "num_input_tokens_seen": 14260515, + "router_z_loss_clip": 3.734375, + "router_z_loss_mlp": 3.81640625, + "step": 673, + "time_per_iteration": 3.739281177520752 + }, + { + "auxiliary_loss_clip": 0.01617343, + "auxiliary_loss_mlp": 0.01500671, + "balance_loss_clip": 1.24334669, + "balance_loss_mlp": 1.12416029, + "epoch": 0.08104370829074731, + "flos": 21034598461920.0, + "grad_norm": 2.367077425178574, + "language_loss": 0.8202852, + "learning_rate": 3.972790046682013e-06, + "loss": 0.85146534, + "num_input_tokens_seen": 14279190, + "router_z_loss_clip": 3.74023438, + "router_z_loss_mlp": 3.75976562, + "step": 674, + "time_per_iteration": 2.972008466720581 + }, + { + "auxiliary_loss_clip": 0.01598146, + "auxiliary_loss_mlp": 0.01464439, + "balance_loss_clip": 1.22463036, + "balance_loss_mlp": 1.0730511, + "epoch": 0.0811639511813864, + "flos": 20085209216640.0, + "grad_norm": 2.7627507348977205, + "language_loss": 0.79094613, + "learning_rate": 3.972661840038977e-06, + "loss": 0.82157195, + "num_input_tokens_seen": 14299480, + "router_z_loss_clip": 3.73632812, + "router_z_loss_mlp": 3.90820312, + "step": 675, + "time_per_iteration": 2.9162416458129883 + }, + { + "auxiliary_loss_clip": 0.01608182, + "auxiliary_loss_mlp": 0.01490253, + "balance_loss_clip": 1.23605442, + "balance_loss_mlp": 1.10267985, + "epoch": 0.08128419407202549, + "flos": 16838602152480.0, + "grad_norm": 2.63776024820187, + "language_loss": 0.83558363, + "learning_rate": 3.972533334144707e-06, + "loss": 0.86656797, + "num_input_tokens_seen": 14316405, + "router_z_loss_clip": 3.71679688, + "router_z_loss_mlp": 3.875, + "step": 676, + "time_per_iteration": 3.7213850021362305 + }, + { + "auxiliary_loss_clip": 0.0160622, + "auxiliary_loss_mlp": 0.01482099, + "balance_loss_clip": 1.23278987, + "balance_loss_mlp": 1.10387194, + "epoch": 0.08140443696266458, + "flos": 23771559739680.0, + "grad_norm": 3.7348766948869185, + "language_loss": 0.78318274, + "learning_rate": 3.972404529018699e-06, + "loss": 0.81406593, + "num_input_tokens_seen": 14336265, + "router_z_loss_clip": 3.72851562, + "router_z_loss_mlp": 3.77929688, + "step": 677, + "time_per_iteration": 2.898247241973877 + }, + { + "auxiliary_loss_clip": 0.01599786, + "auxiliary_loss_mlp": 0.01488701, + "balance_loss_clip": 1.22820652, + "balance_loss_mlp": 1.11199999, + "epoch": 0.08152467985330367, + "flos": 24392487840480.0, + "grad_norm": 2.107285416372633, + "language_loss": 0.85601515, + "learning_rate": 3.972275424680493e-06, + "loss": 0.88690007, + "num_input_tokens_seen": 14356375, + "router_z_loss_clip": 3.71875, + "router_z_loss_mlp": 3.76171875, + "step": 678, + "time_per_iteration": 4.622374773025513 + }, + { + "auxiliary_loss_clip": 0.01602642, + "auxiliary_loss_mlp": 0.01481059, + "balance_loss_clip": 1.22961092, + "balance_loss_mlp": 1.0984453, + "epoch": 0.08164492274394276, + "flos": 19319773368960.0, + "grad_norm": 3.139345665811325, + "language_loss": 0.92010164, + "learning_rate": 3.972146021149673e-06, + "loss": 0.9509387, + "num_input_tokens_seen": 14374650, + "router_z_loss_clip": 3.73046875, + "router_z_loss_mlp": 3.82421875, + "step": 679, + "time_per_iteration": 2.9434452056884766 + }, + { + "auxiliary_loss_clip": 0.01608002, + "auxiliary_loss_mlp": 0.01494538, + "balance_loss_clip": 1.23460984, + "balance_loss_mlp": 1.11077929, + "epoch": 0.08176516563458186, + "flos": 14832258307200.0, + "grad_norm": 2.464249910897245, + "language_loss": 0.7875843, + "learning_rate": 3.972016318445868e-06, + "loss": 0.81860971, + "num_input_tokens_seen": 14392650, + "router_z_loss_clip": 3.734375, + "router_z_loss_mlp": 3.83007812, + "step": 680, + "time_per_iteration": 2.8689041137695312 + }, + { + "auxiliary_loss_clip": 0.01595797, + "auxiliary_loss_mlp": 0.01497033, + "balance_loss_clip": 1.22264171, + "balance_loss_mlp": 1.11918688, + "epoch": 0.08188540852522094, + "flos": 22604574454560.0, + "grad_norm": 2.2663411670464186, + "language_loss": 0.92499846, + "learning_rate": 3.971886316588757e-06, + "loss": 0.95592678, + "num_input_tokens_seen": 14413155, + "router_z_loss_clip": 3.73242188, + "router_z_loss_mlp": 3.77539062, + "step": 681, + "time_per_iteration": 3.0422306060791016 + }, + { + "auxiliary_loss_clip": 0.01604774, + "auxiliary_loss_mlp": 0.01496065, + "balance_loss_clip": 1.23020947, + "balance_loss_mlp": 1.12298775, + "epoch": 0.08200565141586004, + "flos": 19465760314080.0, + "grad_norm": 3.7032158659578975, + "language_loss": 0.73231179, + "learning_rate": 3.9717560155980595e-06, + "loss": 0.76332015, + "num_input_tokens_seen": 14428805, + "router_z_loss_clip": 3.74414062, + "router_z_loss_mlp": 3.7265625, + "step": 682, + "time_per_iteration": 3.007896661758423 + }, + { + "auxiliary_loss_clip": 0.01594538, + "auxiliary_loss_mlp": 0.01488797, + "balance_loss_clip": 1.22141492, + "balance_loss_mlp": 1.11343062, + "epoch": 0.08212589430649912, + "flos": 20596751411040.0, + "grad_norm": 2.306714636786557, + "language_loss": 0.91912448, + "learning_rate": 3.971625415493542e-06, + "loss": 0.94995779, + "num_input_tokens_seen": 14447125, + "router_z_loss_clip": 3.73046875, + "router_z_loss_mlp": 3.74804688, + "step": 683, + "time_per_iteration": 2.913628578186035 + }, + { + "auxiliary_loss_clip": 0.01601488, + "auxiliary_loss_mlp": 0.01492825, + "balance_loss_clip": 1.22865105, + "balance_loss_mlp": 1.11211777, + "epoch": 0.08224613719713822, + "flos": 25955750548800.0, + "grad_norm": 2.2171797932209687, + "language_loss": 0.87412667, + "learning_rate": 3.971494516295017e-06, + "loss": 0.90506983, + "num_input_tokens_seen": 14466575, + "router_z_loss_clip": 3.72460938, + "router_z_loss_mlp": 3.8046875, + "step": 684, + "time_per_iteration": 3.0354676246643066 + }, + { + "auxiliary_loss_clip": 0.01606617, + "auxiliary_loss_mlp": 0.01494352, + "balance_loss_clip": 1.23417151, + "balance_loss_mlp": 1.11841321, + "epoch": 0.08236638008777732, + "flos": 23770990817280.0, + "grad_norm": 1.89833427103477, + "language_loss": 0.85185379, + "learning_rate": 3.971363318022341e-06, + "loss": 0.88286346, + "num_input_tokens_seen": 14487915, + "router_z_loss_clip": 3.72460938, + "router_z_loss_mlp": 3.7578125, + "step": 685, + "time_per_iteration": 2.9998080730438232 + }, + { + "auxiliary_loss_clip": 0.01595204, + "auxiliary_loss_mlp": 0.01495165, + "balance_loss_clip": 1.22063649, + "balance_loss_mlp": 1.12399507, + "epoch": 0.0824866229784164, + "flos": 38802146581440.0, + "grad_norm": 2.1704884205207575, + "language_loss": 0.6863631, + "learning_rate": 3.971231820695417e-06, + "loss": 0.7172668, + "num_input_tokens_seen": 14511530, + "router_z_loss_clip": 3.74414062, + "router_z_loss_mlp": 3.70898438, + "step": 686, + "time_per_iteration": 3.045295476913452 + }, + { + "auxiliary_loss_clip": 0.01602799, + "auxiliary_loss_mlp": 0.01492814, + "balance_loss_clip": 1.23061538, + "balance_loss_mlp": 1.12164402, + "epoch": 0.0826068658690555, + "flos": 23109744718080.0, + "grad_norm": 2.1656769123603956, + "language_loss": 0.81558901, + "learning_rate": 3.971100024334193e-06, + "loss": 0.84654522, + "num_input_tokens_seen": 14529050, + "router_z_loss_clip": 3.72070312, + "router_z_loss_mlp": 3.70703125, + "step": 687, + "time_per_iteration": 2.929691791534424 + }, + { + "auxiliary_loss_clip": 0.01595821, + "auxiliary_loss_mlp": 0.01499995, + "balance_loss_clip": 1.22112966, + "balance_loss_mlp": 1.1307323, + "epoch": 0.08272710875969458, + "flos": 21138181359840.0, + "grad_norm": 2.4831907979292334, + "language_loss": 0.86710685, + "learning_rate": 3.970967928958663e-06, + "loss": 0.89806497, + "num_input_tokens_seen": 14546165, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 3.68945312, + "step": 688, + "time_per_iteration": 2.8852179050445557 + }, + { + "auxiliary_loss_clip": 0.01600533, + "auxiliary_loss_mlp": 0.01493165, + "balance_loss_clip": 1.22836328, + "balance_loss_mlp": 1.12065983, + "epoch": 0.08284735165033368, + "flos": 19065045296160.0, + "grad_norm": 1.9085832644941647, + "language_loss": 0.83651048, + "learning_rate": 3.970835534588865e-06, + "loss": 0.8674475, + "num_input_tokens_seen": 14563660, + "router_z_loss_clip": 3.72265625, + "router_z_loss_mlp": 3.72460938, + "step": 689, + "time_per_iteration": 2.9528796672821045 + }, + { + "auxiliary_loss_clip": 0.01608858, + "auxiliary_loss_mlp": 0.01509522, + "balance_loss_clip": 1.23591113, + "balance_loss_mlp": 1.14140368, + "epoch": 0.08296759454097276, + "flos": 16729330030560.0, + "grad_norm": 1.9129011484601983, + "language_loss": 0.85946953, + "learning_rate": 3.970702841244883e-06, + "loss": 0.89065337, + "num_input_tokens_seen": 14581980, + "router_z_loss_clip": 3.72851562, + "router_z_loss_mlp": 3.6796875, + "step": 690, + "time_per_iteration": 2.8733232021331787 + }, + { + "auxiliary_loss_clip": 0.01598971, + "auxiliary_loss_mlp": 0.0149999, + "balance_loss_clip": 1.22592211, + "balance_loss_mlp": 1.13358843, + "epoch": 0.08308783743161186, + "flos": 18006914923200.0, + "grad_norm": 1.950589115871694, + "language_loss": 0.82641274, + "learning_rate": 3.970569848946847e-06, + "loss": 0.85740238, + "num_input_tokens_seen": 14601795, + "router_z_loss_clip": 3.72460938, + "router_z_loss_mlp": 3.6640625, + "step": 691, + "time_per_iteration": 3.1179747581481934 + }, + { + "auxiliary_loss_clip": 0.01603633, + "auxiliary_loss_mlp": 0.01495386, + "balance_loss_clip": 1.22972238, + "balance_loss_mlp": 1.13222682, + "epoch": 0.08320808032225095, + "flos": 15081259227840.0, + "grad_norm": 2.221616255188275, + "language_loss": 0.82988542, + "learning_rate": 3.970436557714932e-06, + "loss": 0.86087561, + "num_input_tokens_seen": 14618315, + "router_z_loss_clip": 3.73828125, + "router_z_loss_mlp": 3.63085938, + "step": 692, + "time_per_iteration": 2.913670301437378 + }, + { + "auxiliary_loss_clip": 0.01592387, + "auxiliary_loss_mlp": 0.01485529, + "balance_loss_clip": 1.21751499, + "balance_loss_mlp": 1.10959053, + "epoch": 0.08332832321289003, + "flos": 22385309575680.0, + "grad_norm": 2.256362123101141, + "language_loss": 0.86472547, + "learning_rate": 3.970302967569358e-06, + "loss": 0.89550459, + "num_input_tokens_seen": 14636905, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 3.75390625, + "step": 693, + "time_per_iteration": 2.99432373046875 + }, + { + "auxiliary_loss_clip": 0.0160015, + "auxiliary_loss_mlp": 0.01486984, + "balance_loss_clip": 1.22543025, + "balance_loss_mlp": 1.1202004, + "epoch": 0.08344856610352913, + "flos": 24719697355680.0, + "grad_norm": 136.77345152291275, + "language_loss": 0.68393564, + "learning_rate": 3.9701690785303896e-06, + "loss": 0.71480697, + "num_input_tokens_seen": 14656100, + "router_z_loss_clip": 3.75, + "router_z_loss_mlp": 3.66601562, + "step": 694, + "time_per_iteration": 3.1119229793548584 + }, + { + "auxiliary_loss_clip": 0.01598691, + "auxiliary_loss_mlp": 0.01512669, + "balance_loss_clip": 1.22479033, + "balance_loss_mlp": 1.15637589, + "epoch": 0.08356880899416821, + "flos": 25372333762560.0, + "grad_norm": 2.655400209630974, + "language_loss": 0.8826766, + "learning_rate": 3.970034890618339e-06, + "loss": 0.91379023, + "num_input_tokens_seen": 14675790, + "router_z_loss_clip": 3.734375, + "router_z_loss_mlp": 3.56640625, + "step": 695, + "time_per_iteration": 2.901715040206909 + }, + { + "auxiliary_loss_clip": 0.01599434, + "auxiliary_loss_mlp": 0.01496334, + "balance_loss_clip": 1.22548437, + "balance_loss_mlp": 1.13679922, + "epoch": 0.08368905188480731, + "flos": 24355393735680.0, + "grad_norm": 2.2874396275886024, + "language_loss": 0.88741988, + "learning_rate": 3.969900403853562e-06, + "loss": 0.91837752, + "num_input_tokens_seen": 14694830, + "router_z_loss_clip": 3.74023438, + "router_z_loss_mlp": 3.59765625, + "step": 696, + "time_per_iteration": 2.9793126583099365 + }, + { + "auxiliary_loss_clip": 0.01603869, + "auxiliary_loss_mlp": 0.01511272, + "balance_loss_clip": 1.22834039, + "balance_loss_mlp": 1.14887619, + "epoch": 0.08380929477544641, + "flos": 18039078367200.0, + "grad_norm": 2.3693378616809837, + "language_loss": 0.7835691, + "learning_rate": 3.96976561825646e-06, + "loss": 0.81472051, + "num_input_tokens_seen": 14711920, + "router_z_loss_clip": 3.75195312, + "router_z_loss_mlp": 3.62304688, + "step": 697, + "time_per_iteration": 3.0440590381622314 + }, + { + "auxiliary_loss_clip": 0.01598863, + "auxiliary_loss_mlp": 0.01492986, + "balance_loss_clip": 1.2255379, + "balance_loss_mlp": 1.12620282, + "epoch": 0.08392953766608549, + "flos": 26288687216160.0, + "grad_norm": 2.249364211137658, + "language_loss": 0.87307119, + "learning_rate": 3.969630533847479e-06, + "loss": 0.90398967, + "num_input_tokens_seen": 14730880, + "router_z_loss_clip": 3.73242188, + "router_z_loss_mlp": 3.66210938, + "step": 698, + "time_per_iteration": 2.9489147663116455 + }, + { + "auxiliary_loss_clip": 0.01598097, + "auxiliary_loss_mlp": 0.01491946, + "balance_loss_clip": 1.22421408, + "balance_loss_mlp": 1.11734235, + "epoch": 0.08404978055672459, + "flos": 22494354128640.0, + "grad_norm": 2.7768019394575036, + "language_loss": 0.8427676, + "learning_rate": 3.969495150647113e-06, + "loss": 0.87366807, + "num_input_tokens_seen": 14749050, + "router_z_loss_clip": 3.734375, + "router_z_loss_mlp": 3.74414062, + "step": 699, + "time_per_iteration": 2.927212715148926 + }, + { + "auxiliary_loss_clip": 0.01606512, + "auxiliary_loss_mlp": 0.01492757, + "balance_loss_clip": 1.23165107, + "balance_loss_mlp": 1.122159, + "epoch": 0.08417002344736367, + "flos": 24829197046560.0, + "grad_norm": 2.2924071535673973, + "language_loss": 0.76578164, + "learning_rate": 3.969359468675899e-06, + "loss": 0.79677427, + "num_input_tokens_seen": 14769180, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 3.703125, + "step": 700, + "time_per_iteration": 2.916300058364868 + }, + { + "auxiliary_loss_clip": 0.01601131, + "auxiliary_loss_mlp": 0.01486028, + "balance_loss_clip": 1.22503722, + "balance_loss_mlp": 1.10875404, + "epoch": 0.08429026633800277, + "flos": 16947798418080.0, + "grad_norm": 2.3324987926848886, + "language_loss": 0.89750051, + "learning_rate": 3.969223487954418e-06, + "loss": 0.92837214, + "num_input_tokens_seen": 14786640, + "router_z_loss_clip": 3.76367188, + "router_z_loss_mlp": 3.76953125, + "step": 701, + "time_per_iteration": 2.889130115509033 + }, + { + "auxiliary_loss_clip": 0.01603925, + "auxiliary_loss_mlp": 0.01488396, + "balance_loss_clip": 1.22968173, + "balance_loss_mlp": 1.12256658, + "epoch": 0.08441050922864185, + "flos": 23844192894720.0, + "grad_norm": 2.211402503967584, + "language_loss": 0.82765758, + "learning_rate": 3.969087208503301e-06, + "loss": 0.85858071, + "num_input_tokens_seen": 14806720, + "router_z_loss_clip": 3.74023438, + "router_z_loss_mlp": 3.65625, + "step": 702, + "time_per_iteration": 3.796218156814575 + }, + { + "auxiliary_loss_clip": 0.01597651, + "auxiliary_loss_mlp": 0.01491454, + "balance_loss_clip": 1.22203469, + "balance_loss_mlp": 1.11971188, + "epoch": 0.08453075211928095, + "flos": 25522568661600.0, + "grad_norm": 3.398655591284612, + "language_loss": 0.8446697, + "learning_rate": 3.968950630343219e-06, + "loss": 0.8755607, + "num_input_tokens_seen": 14823705, + "router_z_loss_clip": 3.75585938, + "router_z_loss_mlp": 3.71484375, + "step": 703, + "time_per_iteration": 2.9521565437316895 + }, + { + "auxiliary_loss_clip": 0.01590432, + "auxiliary_loss_mlp": 0.01489293, + "balance_loss_clip": 1.21526682, + "balance_loss_mlp": 1.11297345, + "epoch": 0.08465099500992004, + "flos": 19534297227840.0, + "grad_norm": 2.1653721747837626, + "language_loss": 0.9350642, + "learning_rate": 3.968813753494892e-06, + "loss": 0.96586144, + "num_input_tokens_seen": 14841865, + "router_z_loss_clip": 3.75390625, + "router_z_loss_mlp": 3.75976562, + "step": 704, + "time_per_iteration": 3.8563365936279297 + }, + { + "auxiliary_loss_clip": 0.01596108, + "auxiliary_loss_mlp": 0.01508968, + "balance_loss_clip": 1.22057211, + "balance_loss_mlp": 1.13684416, + "epoch": 0.08477123790055913, + "flos": 29353882069440.0, + "grad_norm": 2.7544196804811167, + "language_loss": 0.75489873, + "learning_rate": 3.968676577979084e-06, + "loss": 0.78594947, + "num_input_tokens_seen": 14861415, + "router_z_loss_clip": 3.75585938, + "router_z_loss_mlp": 3.71875, + "step": 705, + "time_per_iteration": 3.809011936187744 + }, + { + "auxiliary_loss_clip": 0.01602819, + "auxiliary_loss_mlp": 0.01503179, + "balance_loss_clip": 1.2285192, + "balance_loss_mlp": 1.13525164, + "epoch": 0.08489148079119822, + "flos": 18626477610240.0, + "grad_norm": 2.856245355363491, + "language_loss": 0.78248751, + "learning_rate": 3.968539103816605e-06, + "loss": 0.81354749, + "num_input_tokens_seen": 14879215, + "router_z_loss_clip": 3.74023438, + "router_z_loss_mlp": 3.67382812, + "step": 706, + "time_per_iteration": 3.6904115676879883 + }, + { + "auxiliary_loss_clip": 0.01599934, + "auxiliary_loss_mlp": 0.01496732, + "balance_loss_clip": 1.22398508, + "balance_loss_mlp": 1.12556219, + "epoch": 0.0850117236818373, + "flos": 23473327703040.0, + "grad_norm": 2.0414305306468683, + "language_loss": 0.89637852, + "learning_rate": 3.9684013310283085e-06, + "loss": 0.92734528, + "num_input_tokens_seen": 14897900, + "router_z_loss_clip": 3.75976562, + "router_z_loss_mlp": 3.70898438, + "step": 707, + "time_per_iteration": 3.1379685401916504 + }, + { + "auxiliary_loss_clip": 0.01599929, + "auxiliary_loss_mlp": 0.01485415, + "balance_loss_clip": 1.22582698, + "balance_loss_mlp": 1.11004913, + "epoch": 0.0851319665724764, + "flos": 40628253988800.0, + "grad_norm": 1.8924713498650565, + "language_loss": 0.64661896, + "learning_rate": 3.9682632596350956e-06, + "loss": 0.67747247, + "num_input_tokens_seen": 14919065, + "router_z_loss_clip": 3.73828125, + "router_z_loss_mlp": 3.75195312, + "step": 708, + "time_per_iteration": 3.257740020751953 + }, + { + "auxiliary_loss_clip": 0.01593874, + "auxiliary_loss_mlp": 0.01492559, + "balance_loss_clip": 1.21760678, + "balance_loss_mlp": 1.11681128, + "epoch": 0.0852522094631155, + "flos": 15880906640160.0, + "grad_norm": 2.678434310155099, + "language_loss": 0.78658402, + "learning_rate": 3.968124889657911e-06, + "loss": 0.81744838, + "num_input_tokens_seen": 14934165, + "router_z_loss_clip": 3.76171875, + "router_z_loss_mlp": 3.75585938, + "step": 709, + "time_per_iteration": 3.007093906402588 + }, + { + "auxiliary_loss_clip": 0.0159448, + "auxiliary_loss_mlp": 0.01489045, + "balance_loss_clip": 1.2199614, + "balance_loss_mlp": 1.10910082, + "epoch": 0.08537245235375458, + "flos": 14568427476000.0, + "grad_norm": 2.4267979980151306, + "language_loss": 0.90491593, + "learning_rate": 3.967986221117746e-06, + "loss": 0.93575114, + "num_input_tokens_seen": 14950105, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 3.79492188, + "step": 710, + "time_per_iteration": 3.005725383758545 + }, + { + "auxiliary_loss_clip": 0.01597655, + "auxiliary_loss_mlp": 0.01490257, + "balance_loss_clip": 1.22286093, + "balance_loss_mlp": 1.12004089, + "epoch": 0.08549269524439368, + "flos": 26471351056320.0, + "grad_norm": 2.172421959819856, + "language_loss": 0.86669904, + "learning_rate": 3.967847254035635e-06, + "loss": 0.89757812, + "num_input_tokens_seen": 14969490, + "router_z_loss_clip": 3.74609375, + "router_z_loss_mlp": 3.69726562, + "step": 711, + "time_per_iteration": 3.0201776027679443 + }, + { + "auxiliary_loss_clip": 0.01596553, + "auxiliary_loss_mlp": 0.01487475, + "balance_loss_clip": 1.2218864, + "balance_loss_mlp": 1.10791206, + "epoch": 0.08561293813503276, + "flos": 13591957160160.0, + "grad_norm": 2.278614868209407, + "language_loss": 0.86378092, + "learning_rate": 3.967707988432661e-06, + "loss": 0.89462113, + "num_input_tokens_seen": 14987195, + "router_z_loss_clip": 3.74609375, + "router_z_loss_mlp": 3.79296875, + "step": 712, + "time_per_iteration": 2.874831199645996 + }, + { + "auxiliary_loss_clip": 0.01590339, + "auxiliary_loss_mlp": 0.01493219, + "balance_loss_clip": 1.21550298, + "balance_loss_mlp": 1.10926986, + "epoch": 0.08573318102567186, + "flos": 26945988786720.0, + "grad_norm": 3.230607507426697, + "language_loss": 0.88181901, + "learning_rate": 3.967568424329949e-06, + "loss": 0.91265458, + "num_input_tokens_seen": 15007620, + "router_z_loss_clip": 3.74609375, + "router_z_loss_mlp": 3.83789062, + "step": 713, + "time_per_iteration": 3.0422306060791016 + }, + { + "auxiliary_loss_clip": 0.01695634, + "auxiliary_loss_mlp": 0.0143782, + "balance_loss_clip": 1.3215822, + "balance_loss_mlp": 1.10365295, + "epoch": 0.08585342391631094, + "flos": 67309842657600.0, + "grad_norm": 0.847507571056779, + "language_loss": 0.55525267, + "learning_rate": 3.967428561748671e-06, + "loss": 0.58658725, + "num_input_tokens_seen": 15075590, + "router_z_loss_clip": 3.7421875, + "router_z_loss_mlp": 3.34375, + "step": 714, + "time_per_iteration": 3.5954620838165283 + }, + { + "auxiliary_loss_clip": 0.01588641, + "auxiliary_loss_mlp": 0.01499705, + "balance_loss_clip": 1.2129072, + "balance_loss_mlp": 1.11499274, + "epoch": 0.08597366680695004, + "flos": 22458853006560.0, + "grad_norm": 2.2824688370154673, + "language_loss": 0.87798446, + "learning_rate": 3.967288400710045e-06, + "loss": 0.9088679, + "num_input_tokens_seen": 15095055, + "router_z_loss_clip": 3.75976562, + "router_z_loss_mlp": 3.84765625, + "step": 715, + "time_per_iteration": 2.901688814163208 + }, + { + "auxiliary_loss_clip": 0.0160132, + "auxiliary_loss_mlp": 0.01483845, + "balance_loss_clip": 1.22607744, + "balance_loss_mlp": 1.1033287, + "epoch": 0.08609390969758914, + "flos": 23552787926880.0, + "grad_norm": 3.7593448147604063, + "language_loss": 0.88807309, + "learning_rate": 3.9671479412353335e-06, + "loss": 0.91892475, + "num_input_tokens_seen": 15113520, + "router_z_loss_clip": 3.75195312, + "router_z_loss_mlp": 3.80078125, + "step": 716, + "time_per_iteration": 2.936818838119507 + }, + { + "auxiliary_loss_clip": 0.01589901, + "auxiliary_loss_mlp": 0.01474104, + "balance_loss_clip": 1.21432948, + "balance_loss_mlp": 1.09587693, + "epoch": 0.08621415258822822, + "flos": 25888275623520.0, + "grad_norm": 3.5372906503827304, + "language_loss": 0.74323058, + "learning_rate": 3.967007183345843e-06, + "loss": 0.77387059, + "num_input_tokens_seen": 15133375, + "router_z_loss_clip": 3.75195312, + "router_z_loss_mlp": 3.77929688, + "step": 717, + "time_per_iteration": 2.951188087463379 + }, + { + "auxiliary_loss_clip": 0.01599116, + "auxiliary_loss_mlp": 0.01483324, + "balance_loss_clip": 1.22296166, + "balance_loss_mlp": 1.10528755, + "epoch": 0.08633439547886732, + "flos": 13591274453280.0, + "grad_norm": 4.564646226596592, + "language_loss": 0.89541745, + "learning_rate": 3.966866127062927e-06, + "loss": 0.92624187, + "num_input_tokens_seen": 15150500, + "router_z_loss_clip": 3.76171875, + "router_z_loss_mlp": 3.77539062, + "step": 718, + "time_per_iteration": 2.9340767860412598 + }, + { + "auxiliary_loss_clip": 0.01687221, + "auxiliary_loss_mlp": 0.0143158, + "balance_loss_clip": 1.31068206, + "balance_loss_mlp": 1.08062744, + "epoch": 0.0864546383695064, + "flos": 57773204804160.0, + "grad_norm": 0.9063596431775319, + "language_loss": 0.6271292, + "learning_rate": 3.966724772407982e-06, + "loss": 0.65831721, + "num_input_tokens_seen": 15208015, + "router_z_loss_clip": 3.765625, + "router_z_loss_mlp": 3.515625, + "step": 719, + "time_per_iteration": 3.393970489501953 + }, + { + "auxiliary_loss_clip": 0.01598037, + "auxiliary_loss_mlp": 0.01493482, + "balance_loss_clip": 1.22281969, + "balance_loss_mlp": 1.11220264, + "epoch": 0.0865748812601455, + "flos": 20048987459520.0, + "grad_norm": 2.7538666221163757, + "language_loss": 0.8882159, + "learning_rate": 3.966583119402454e-06, + "loss": 0.91913104, + "num_input_tokens_seen": 15224780, + "router_z_loss_clip": 3.74609375, + "router_z_loss_mlp": 3.81054688, + "step": 720, + "time_per_iteration": 2.99977445602417 + }, + { + "auxiliary_loss_clip": 0.01592887, + "auxiliary_loss_mlp": 0.01500589, + "balance_loss_clip": 1.21783185, + "balance_loss_mlp": 1.12255216, + "epoch": 0.08669512415078459, + "flos": 35265386178720.0, + "grad_norm": 2.057679625552229, + "language_loss": 0.8223722, + "learning_rate": 3.9664411680678305e-06, + "loss": 0.85330695, + "num_input_tokens_seen": 15246535, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 3.77929688, + "step": 721, + "time_per_iteration": 3.1767737865448 + }, + { + "auxiliary_loss_clip": 0.01677392, + "auxiliary_loss_mlp": 0.0142701, + "balance_loss_clip": 1.30083871, + "balance_loss_mlp": 1.08368683, + "epoch": 0.08681536704142367, + "flos": 65661544285920.0, + "grad_norm": 0.8510128504606547, + "language_loss": 0.61391914, + "learning_rate": 3.966298918425644e-06, + "loss": 0.6449632, + "num_input_tokens_seen": 15304025, + "router_z_loss_clip": 3.765625, + "router_z_loss_mlp": 3.4375, + "step": 722, + "time_per_iteration": 3.3221020698547363 + }, + { + "auxiliary_loss_clip": 0.01592484, + "auxiliary_loss_mlp": 0.01498933, + "balance_loss_clip": 1.21528137, + "balance_loss_mlp": 1.1250931, + "epoch": 0.08693560993206277, + "flos": 34532682697440.0, + "grad_norm": 4.035010622804573, + "language_loss": 0.82775956, + "learning_rate": 3.966156370497476e-06, + "loss": 0.85867381, + "num_input_tokens_seen": 15327635, + "router_z_loss_clip": 3.76953125, + "router_z_loss_mlp": 3.734375, + "step": 723, + "time_per_iteration": 3.2505571842193604 + }, + { + "auxiliary_loss_clip": 0.01595047, + "auxiliary_loss_mlp": 0.01519282, + "balance_loss_clip": 1.21913362, + "balance_loss_mlp": 1.14849377, + "epoch": 0.08705585282270185, + "flos": 23151693627360.0, + "grad_norm": 1.9627883300265985, + "language_loss": 0.88802707, + "learning_rate": 3.96601352430495e-06, + "loss": 0.91917038, + "num_input_tokens_seen": 15347405, + "router_z_loss_clip": 3.7578125, + "router_z_loss_mlp": 3.70507812, + "step": 724, + "time_per_iteration": 3.1661064624786377 + }, + { + "auxiliary_loss_clip": 0.01599407, + "auxiliary_loss_mlp": 0.014795, + "balance_loss_clip": 1.22377574, + "balance_loss_mlp": 1.10985601, + "epoch": 0.08717609571334095, + "flos": 29500210368000.0, + "grad_norm": 1.7389494310864313, + "language_loss": 0.83334208, + "learning_rate": 3.965870379869735e-06, + "loss": 0.86413109, + "num_input_tokens_seen": 15369450, + "router_z_loss_clip": 3.75585938, + "router_z_loss_mlp": 3.6953125, + "step": 725, + "time_per_iteration": 3.023346185684204 + }, + { + "auxiliary_loss_clip": 0.01589249, + "auxiliary_loss_mlp": 0.01513901, + "balance_loss_clip": 1.21440232, + "balance_loss_mlp": 1.1476903, + "epoch": 0.08729633860398003, + "flos": 20669574206880.0, + "grad_norm": 2.348096641833039, + "language_loss": 0.86993301, + "learning_rate": 3.965726937213547e-06, + "loss": 0.9009645, + "num_input_tokens_seen": 15388085, + "router_z_loss_clip": 3.74414062, + "router_z_loss_mlp": 3.66015625, + "step": 726, + "time_per_iteration": 3.0539655685424805 + }, + { + "auxiliary_loss_clip": 0.0158755, + "auxiliary_loss_mlp": 0.01504934, + "balance_loss_clip": 1.2112478, + "balance_loss_mlp": 1.13452744, + "epoch": 0.08741658149461913, + "flos": 18371484040320.0, + "grad_norm": 3.2039472928209562, + "language_loss": 0.81142759, + "learning_rate": 3.965583196358144e-06, + "loss": 0.84235251, + "num_input_tokens_seen": 15407120, + "router_z_loss_clip": 3.76171875, + "router_z_loss_mlp": 3.70117188, + "step": 727, + "time_per_iteration": 2.9896914958953857 + }, + { + "auxiliary_loss_clip": 0.01587409, + "auxiliary_loss_mlp": 0.01491469, + "balance_loss_clip": 1.21253359, + "balance_loss_mlp": 1.11877275, + "epoch": 0.08753682438525823, + "flos": 18731425921920.0, + "grad_norm": 2.5651055951051878, + "language_loss": 0.74654037, + "learning_rate": 3.965439157325335e-06, + "loss": 0.77732915, + "num_input_tokens_seen": 15424485, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 3.72265625, + "step": 728, + "time_per_iteration": 3.038221836090088 + }, + { + "auxiliary_loss_clip": 0.01584439, + "auxiliary_loss_mlp": 0.01492035, + "balance_loss_clip": 1.20829523, + "balance_loss_mlp": 1.1210556, + "epoch": 0.08765706727589731, + "flos": 27778254780960.0, + "grad_norm": 1.9925148866001892, + "language_loss": 0.76147139, + "learning_rate": 3.965294820136968e-06, + "loss": 0.79223609, + "num_input_tokens_seen": 15446285, + "router_z_loss_clip": 3.76367188, + "router_z_loss_mlp": 3.70703125, + "step": 729, + "time_per_iteration": 3.0692241191864014 + }, + { + "auxiliary_loss_clip": 0.01587724, + "auxiliary_loss_mlp": 0.01528105, + "balance_loss_clip": 1.21205759, + "balance_loss_mlp": 1.16551805, + "epoch": 0.08777731016653641, + "flos": 24391425852000.0, + "grad_norm": 2.3996200820412166, + "language_loss": 0.87337112, + "learning_rate": 3.965150184814938e-06, + "loss": 0.90452945, + "num_input_tokens_seen": 15465770, + "router_z_loss_clip": 3.7578125, + "router_z_loss_mlp": 3.62890625, + "step": 730, + "time_per_iteration": 3.744082450866699 + }, + { + "auxiliary_loss_clip": 0.01590743, + "auxiliary_loss_mlp": 0.01522218, + "balance_loss_clip": 1.21466696, + "balance_loss_mlp": 1.16649795, + "epoch": 0.08789755305717549, + "flos": 21984063563520.0, + "grad_norm": 3.024634748274227, + "language_loss": 0.76668245, + "learning_rate": 3.965005251381189e-06, + "loss": 0.7978121, + "num_input_tokens_seen": 15483705, + "router_z_loss_clip": 3.75585938, + "router_z_loss_mlp": 3.55664062, + "step": 731, + "time_per_iteration": 2.9869532585144043 + }, + { + "auxiliary_loss_clip": 0.01649696, + "auxiliary_loss_mlp": 0.01529205, + "balance_loss_clip": 1.27171588, + "balance_loss_mlp": 1.21792603, + "epoch": 0.08801779594781459, + "flos": 58366520840160.0, + "grad_norm": 0.9696673403004431, + "language_loss": 0.64616084, + "learning_rate": 3.964860019857705e-06, + "loss": 0.67794985, + "num_input_tokens_seen": 15548620, + "router_z_loss_clip": 3.78125, + "router_z_loss_mlp": 3.109375, + "step": 732, + "time_per_iteration": 4.446175813674927 + }, + { + "auxiliary_loss_clip": 0.01594927, + "auxiliary_loss_mlp": 0.01485839, + "balance_loss_clip": 1.22184908, + "balance_loss_mlp": 1.11180842, + "epoch": 0.08813803883845367, + "flos": 23297035793760.0, + "grad_norm": 2.2424131746477896, + "language_loss": 0.8424983, + "learning_rate": 3.964714490266518e-06, + "loss": 0.87330604, + "num_input_tokens_seen": 15569265, + "router_z_loss_clip": 3.73046875, + "router_z_loss_mlp": 3.73632812, + "step": 733, + "time_per_iteration": 4.6349310874938965 + }, + { + "auxiliary_loss_clip": 0.01642177, + "auxiliary_loss_mlp": 0.01424202, + "balance_loss_clip": 1.26429641, + "balance_loss_mlp": 1.09003448, + "epoch": 0.08825828172909277, + "flos": 63431232469920.0, + "grad_norm": 0.8987640295599924, + "language_loss": 0.64551973, + "learning_rate": 3.964568662629706e-06, + "loss": 0.67618358, + "num_input_tokens_seen": 15630570, + "router_z_loss_clip": 3.78125, + "router_z_loss_mlp": 3.34375, + "step": 734, + "time_per_iteration": 3.3288824558258057 + }, + { + "auxiliary_loss_clip": 0.01587731, + "auxiliary_loss_mlp": 0.01486555, + "balance_loss_clip": 1.21202588, + "balance_loss_mlp": 1.10775518, + "epoch": 0.08837852461973186, + "flos": 26723234517120.0, + "grad_norm": 2.3559877075065816, + "language_loss": 0.8451544, + "learning_rate": 3.9644225369693895e-06, + "loss": 0.87589729, + "num_input_tokens_seen": 15650870, + "router_z_loss_clip": 3.75390625, + "router_z_loss_mlp": 3.78320312, + "step": 735, + "time_per_iteration": 2.9672422409057617 + }, + { + "auxiliary_loss_clip": 0.01597811, + "auxiliary_loss_mlp": 0.01502458, + "balance_loss_clip": 1.22083056, + "balance_loss_mlp": 1.11679232, + "epoch": 0.08849876751037095, + "flos": 27267547006080.0, + "grad_norm": 2.8441519583303645, + "language_loss": 0.86698151, + "learning_rate": 3.964276113307735e-06, + "loss": 0.89798421, + "num_input_tokens_seen": 15670835, + "router_z_loss_clip": 3.76757812, + "router_z_loss_mlp": 3.85546875, + "step": 736, + "time_per_iteration": 3.0749404430389404 + }, + { + "auxiliary_loss_clip": 0.01582332, + "auxiliary_loss_mlp": 0.01500224, + "balance_loss_clip": 1.20582509, + "balance_loss_mlp": 1.10959888, + "epoch": 0.08861901040101004, + "flos": 19830822497280.0, + "grad_norm": 2.7242547783916193, + "language_loss": 0.80779219, + "learning_rate": 3.9641293916669574e-06, + "loss": 0.8386178, + "num_input_tokens_seen": 15689795, + "router_z_loss_clip": 3.76171875, + "router_z_loss_mlp": 3.90429688, + "step": 737, + "time_per_iteration": 2.971796989440918 + }, + { + "auxiliary_loss_clip": 0.01589547, + "auxiliary_loss_mlp": 0.01496208, + "balance_loss_clip": 1.21507072, + "balance_loss_mlp": 1.09299433, + "epoch": 0.08873925329164913, + "flos": 23660922204000.0, + "grad_norm": 2.1912139380154247, + "language_loss": 0.82500297, + "learning_rate": 3.9639823720693115e-06, + "loss": 0.85586053, + "num_input_tokens_seen": 15711650, + "router_z_loss_clip": 3.74023438, + "router_z_loss_mlp": 4.03320312, + "step": 738, + "time_per_iteration": 3.089106798171997 + }, + { + "auxiliary_loss_clip": 0.01624016, + "auxiliary_loss_mlp": 0.0147718, + "balance_loss_clip": 1.24804187, + "balance_loss_mlp": 1.09952545, + "epoch": 0.08885949618228822, + "flos": 71839154916000.0, + "grad_norm": 0.8713739522987326, + "language_loss": 0.5988934, + "learning_rate": 3.963835054537102e-06, + "loss": 0.62990534, + "num_input_tokens_seen": 15780615, + "router_z_loss_clip": 3.7578125, + "router_z_loss_mlp": 3.7734375, + "step": 739, + "time_per_iteration": 3.5472187995910645 + }, + { + "auxiliary_loss_clip": 0.01589113, + "auxiliary_loss_mlp": 0.01511999, + "balance_loss_clip": 1.21527278, + "balance_loss_mlp": 1.11622405, + "epoch": 0.08897973907292732, + "flos": 22348367183520.0, + "grad_norm": 4.044906858151719, + "language_loss": 0.60493231, + "learning_rate": 3.963687439092676e-06, + "loss": 0.63594335, + "num_input_tokens_seen": 15801300, + "router_z_loss_clip": 3.73632812, + "router_z_loss_mlp": 3.95898438, + "step": 740, + "time_per_iteration": 2.9672040939331055 + }, + { + "auxiliary_loss_clip": 0.01583187, + "auxiliary_loss_mlp": 0.01487691, + "balance_loss_clip": 1.20815659, + "balance_loss_mlp": 1.09477735, + "epoch": 0.0890999819635664, + "flos": 21253977125280.0, + "grad_norm": 2.206049818910158, + "language_loss": 0.80725104, + "learning_rate": 3.963539525758427e-06, + "loss": 0.83795977, + "num_input_tokens_seen": 15820860, + "router_z_loss_clip": 3.75, + "router_z_loss_mlp": 3.92578125, + "step": 741, + "time_per_iteration": 3.016366958618164 + }, + { + "auxiliary_loss_clip": 0.01586374, + "auxiliary_loss_mlp": 0.01482981, + "balance_loss_clip": 1.21274924, + "balance_loss_mlp": 1.09578967, + "epoch": 0.0892202248542055, + "flos": 25373054397600.0, + "grad_norm": 2.4693083308252084, + "language_loss": 0.67676628, + "learning_rate": 3.9633913145567925e-06, + "loss": 0.70745981, + "num_input_tokens_seen": 15841350, + "router_z_loss_clip": 3.73632812, + "router_z_loss_mlp": 3.87109375, + "step": 742, + "time_per_iteration": 3.1129274368286133 + }, + { + "auxiliary_loss_clip": 0.01592933, + "auxiliary_loss_mlp": 0.01485367, + "balance_loss_clip": 1.21973586, + "balance_loss_mlp": 1.09874701, + "epoch": 0.08934046774484458, + "flos": 24459962765760.0, + "grad_norm": 1.9850607088657475, + "language_loss": 0.81487489, + "learning_rate": 3.9632428055102575e-06, + "loss": 0.84565794, + "num_input_tokens_seen": 15861360, + "router_z_loss_clip": 3.73242188, + "router_z_loss_mlp": 3.86328125, + "step": 743, + "time_per_iteration": 3.04154896736145 + }, + { + "auxiliary_loss_clip": 0.0159269, + "auxiliary_loss_mlp": 0.01487668, + "balance_loss_clip": 1.21890998, + "balance_loss_mlp": 1.10753334, + "epoch": 0.08946071063548368, + "flos": 35775714672000.0, + "grad_norm": 3.1092329221018917, + "language_loss": 0.67272866, + "learning_rate": 3.9630939986413495e-06, + "loss": 0.70353222, + "num_input_tokens_seen": 15883160, + "router_z_loss_clip": 3.734375, + "router_z_loss_mlp": 3.796875, + "step": 744, + "time_per_iteration": 3.1119117736816406 + }, + { + "auxiliary_loss_clip": 0.01589826, + "auxiliary_loss_mlp": 0.01481579, + "balance_loss_clip": 1.21518779, + "balance_loss_mlp": 1.09267092, + "epoch": 0.08958095352612276, + "flos": 14358379140000.0, + "grad_norm": 2.494429652104513, + "language_loss": 0.78092337, + "learning_rate": 3.962944893972643e-06, + "loss": 0.81163752, + "num_input_tokens_seen": 15901610, + "router_z_loss_clip": 3.74414062, + "router_z_loss_mlp": 3.88476562, + "step": 745, + "time_per_iteration": 2.9218428134918213 + }, + { + "auxiliary_loss_clip": 0.01593252, + "auxiliary_loss_mlp": 0.01479106, + "balance_loss_clip": 1.22055602, + "balance_loss_mlp": 1.09935308, + "epoch": 0.08970119641676186, + "flos": 17854935328800.0, + "grad_norm": 3.0469379400546526, + "language_loss": 0.9138903, + "learning_rate": 3.962795491526756e-06, + "loss": 0.94461381, + "num_input_tokens_seen": 15918770, + "router_z_loss_clip": 3.72460938, + "router_z_loss_mlp": 3.79296875, + "step": 746, + "time_per_iteration": 2.961228132247925 + }, + { + "auxiliary_loss_clip": 0.01585119, + "auxiliary_loss_mlp": 0.0148385, + "balance_loss_clip": 1.21005988, + "balance_loss_mlp": 1.11039126, + "epoch": 0.08982143930740095, + "flos": 20813664744000.0, + "grad_norm": 2.455531559399907, + "language_loss": 0.89674735, + "learning_rate": 3.962645791326354e-06, + "loss": 0.92743707, + "num_input_tokens_seen": 15938025, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 3.72851562, + "step": 747, + "time_per_iteration": 3.01686429977417 + }, + { + "auxiliary_loss_clip": 0.01589296, + "auxiliary_loss_mlp": 0.01504019, + "balance_loss_clip": 1.21665645, + "balance_loss_mlp": 1.13246703, + "epoch": 0.08994168219804004, + "flos": 24100020884160.0, + "grad_norm": 4.562374889128983, + "language_loss": 0.83518308, + "learning_rate": 3.962495793394146e-06, + "loss": 0.86611617, + "num_input_tokens_seen": 15957215, + "router_z_loss_clip": 3.72265625, + "router_z_loss_mlp": 3.7109375, + "step": 748, + "time_per_iteration": 2.9463322162628174 + }, + { + "auxiliary_loss_clip": 0.01602543, + "auxiliary_loss_mlp": 0.01416183, + "balance_loss_clip": 1.2298609, + "balance_loss_mlp": 1.08354187, + "epoch": 0.09006192508867913, + "flos": 57195022104000.0, + "grad_norm": 0.7781270228460038, + "language_loss": 0.61154282, + "learning_rate": 3.9623454977528864e-06, + "loss": 0.64173007, + "num_input_tokens_seen": 16015870, + "router_z_loss_clip": 3.71875, + "router_z_loss_mlp": 3.328125, + "step": 749, + "time_per_iteration": 3.296815872192383 + }, + { + "auxiliary_loss_clip": 0.01597551, + "auxiliary_loss_mlp": 0.0149696, + "balance_loss_clip": 1.22529626, + "balance_loss_mlp": 1.13017654, + "epoch": 0.09018216797931822, + "flos": 20489717050560.0, + "grad_norm": 1.9091799737993724, + "language_loss": 0.84992456, + "learning_rate": 3.962194904425375e-06, + "loss": 0.88086969, + "num_input_tokens_seen": 16036500, + "router_z_loss_clip": 3.72265625, + "router_z_loss_mlp": 3.6640625, + "step": 750, + "time_per_iteration": 2.9233391284942627 + }, + { + "auxiliary_loss_clip": 0.0158593, + "auxiliary_loss_mlp": 0.01494404, + "balance_loss_clip": 1.21285295, + "balance_loss_mlp": 1.12189913, + "epoch": 0.09030241086995731, + "flos": 22640341073760.0, + "grad_norm": 2.327340211026155, + "language_loss": 0.68598843, + "learning_rate": 3.9620440134344566e-06, + "loss": 0.71679175, + "num_input_tokens_seen": 16054655, + "router_z_loss_clip": 3.73046875, + "router_z_loss_mlp": 3.72070312, + "step": 751, + "time_per_iteration": 2.9603796005249023 + }, + { + "auxiliary_loss_clip": 0.01587205, + "auxiliary_loss_mlp": 0.01497242, + "balance_loss_clip": 1.21430039, + "balance_loss_mlp": 1.131603, + "epoch": 0.09042265376059641, + "flos": 21873653596800.0, + "grad_norm": 3.386739540005695, + "language_loss": 0.82535648, + "learning_rate": 3.9618928248030215e-06, + "loss": 0.85620093, + "num_input_tokens_seen": 16074165, + "router_z_loss_clip": 3.72070312, + "router_z_loss_mlp": 3.65429688, + "step": 752, + "time_per_iteration": 3.0113041400909424 + }, + { + "auxiliary_loss_clip": 0.01586235, + "auxiliary_loss_mlp": 0.01516621, + "balance_loss_clip": 1.21427035, + "balance_loss_mlp": 1.15975618, + "epoch": 0.0905428966512355, + "flos": 24318489271680.0, + "grad_norm": 2.326468396125218, + "language_loss": 0.83450961, + "learning_rate": 3.961741338554005e-06, + "loss": 0.86553812, + "num_input_tokens_seen": 16092505, + "router_z_loss_clip": 3.71484375, + "router_z_loss_mlp": 3.5703125, + "step": 753, + "time_per_iteration": 2.9160525798797607 + }, + { + "auxiliary_loss_clip": 0.01588178, + "auxiliary_loss_mlp": 0.01523087, + "balance_loss_clip": 1.21604407, + "balance_loss_mlp": 1.1606909, + "epoch": 0.09066313954187459, + "flos": 35847968545440.0, + "grad_norm": 1.9581446361664578, + "language_loss": 0.75930345, + "learning_rate": 3.9615895547103865e-06, + "loss": 0.79041612, + "num_input_tokens_seen": 16116150, + "router_z_loss_clip": 3.71679688, + "router_z_loss_mlp": 3.62109375, + "step": 754, + "time_per_iteration": 3.063530445098877 + }, + { + "auxiliary_loss_clip": 0.01585282, + "auxiliary_loss_mlp": 0.01506563, + "balance_loss_clip": 1.21481514, + "balance_loss_mlp": 1.143785, + "epoch": 0.09078338243251367, + "flos": 29171294085600.0, + "grad_norm": 2.1303779174977393, + "language_loss": 0.77718759, + "learning_rate": 3.961437473295193e-06, + "loss": 0.80810606, + "num_input_tokens_seen": 16136295, + "router_z_loss_clip": 3.70117188, + "router_z_loss_mlp": 3.625, + "step": 755, + "time_per_iteration": 2.9931538105010986 + }, + { + "auxiliary_loss_clip": 0.01584866, + "auxiliary_loss_mlp": 0.01499265, + "balance_loss_clip": 1.21346259, + "balance_loss_mlp": 1.13057411, + "epoch": 0.09090362532315277, + "flos": 21909837425760.0, + "grad_norm": 5.249619279565019, + "language_loss": 0.72132778, + "learning_rate": 3.961285094331495e-06, + "loss": 0.75216907, + "num_input_tokens_seen": 16154210, + "router_z_loss_clip": 3.71289062, + "router_z_loss_mlp": 3.68164062, + "step": 756, + "time_per_iteration": 2.941368579864502 + }, + { + "auxiliary_loss_clip": 0.01576455, + "auxiliary_loss_mlp": 0.0149534, + "balance_loss_clip": 1.20544028, + "balance_loss_mlp": 1.1266495, + "epoch": 0.09102386821379185, + "flos": 27346969301760.0, + "grad_norm": 6.058613471782511, + "language_loss": 0.856359, + "learning_rate": 3.961132417842406e-06, + "loss": 0.88707691, + "num_input_tokens_seen": 16173995, + "router_z_loss_clip": 3.70898438, + "router_z_loss_mlp": 3.68359375, + "step": 757, + "time_per_iteration": 3.7783432006835938 + }, + { + "auxiliary_loss_clip": 0.01587673, + "auxiliary_loss_mlp": 0.01501836, + "balance_loss_clip": 1.21768653, + "balance_loss_mlp": 1.1373421, + "epoch": 0.09114411110443095, + "flos": 20815599080160.0, + "grad_norm": 3.328075460275012, + "language_loss": 0.75337863, + "learning_rate": 3.960979443851089e-06, + "loss": 0.78427374, + "num_input_tokens_seen": 16191020, + "router_z_loss_clip": 3.69726562, + "router_z_loss_mlp": 3.64648438, + "step": 758, + "time_per_iteration": 2.8829658031463623 + }, + { + "auxiliary_loss_clip": 0.01598427, + "auxiliary_loss_mlp": 0.01510361, + "balance_loss_clip": 1.22722185, + "balance_loss_mlp": 1.14224291, + "epoch": 0.09126435399507005, + "flos": 26148654992160.0, + "grad_norm": 2.3963594450798156, + "language_loss": 0.78845084, + "learning_rate": 3.96082617238075e-06, + "loss": 0.81953865, + "num_input_tokens_seen": 16213645, + "router_z_loss_clip": 3.7109375, + "router_z_loss_mlp": 3.6796875, + "step": 759, + "time_per_iteration": 2.975959539413452 + }, + { + "auxiliary_loss_clip": 0.0158421, + "auxiliary_loss_mlp": 0.01471209, + "balance_loss_clip": 1.21403539, + "balance_loss_mlp": 1.09164691, + "epoch": 0.09138459688570913, + "flos": 24391198283040.0, + "grad_norm": 7.485638529870019, + "language_loss": 0.80091494, + "learning_rate": 3.960672603454639e-06, + "loss": 0.83146918, + "num_input_tokens_seen": 16233625, + "router_z_loss_clip": 3.70117188, + "router_z_loss_mlp": 3.7890625, + "step": 760, + "time_per_iteration": 4.688950777053833 + }, + { + "auxiliary_loss_clip": 0.01586672, + "auxiliary_loss_mlp": 0.01500126, + "balance_loss_clip": 1.21471417, + "balance_loss_mlp": 1.12056386, + "epoch": 0.09150483977634823, + "flos": 21034788102720.0, + "grad_norm": 6.124156555866795, + "language_loss": 0.76753885, + "learning_rate": 3.960518737096054e-06, + "loss": 0.79840684, + "num_input_tokens_seen": 16253255, + "router_z_loss_clip": 3.71875, + "router_z_loss_mlp": 3.79296875, + "step": 761, + "time_per_iteration": 3.7180402278900146 + }, + { + "auxiliary_loss_clip": 0.0158686, + "auxiliary_loss_mlp": 0.01503684, + "balance_loss_clip": 1.21618462, + "balance_loss_mlp": 1.13155985, + "epoch": 0.09162508266698731, + "flos": 22859226671040.0, + "grad_norm": 2.465689880664268, + "language_loss": 0.73424786, + "learning_rate": 3.960364573328334e-06, + "loss": 0.76515329, + "num_input_tokens_seen": 16272580, + "router_z_loss_clip": 3.70898438, + "router_z_loss_mlp": 3.71875, + "step": 762, + "time_per_iteration": 2.9332032203674316 + }, + { + "auxiliary_loss_clip": 0.0158351, + "auxiliary_loss_mlp": 0.01488312, + "balance_loss_clip": 1.21142185, + "balance_loss_mlp": 1.10932159, + "epoch": 0.0917453255576264, + "flos": 21726490878720.0, + "grad_norm": 3.854274665735693, + "language_loss": 0.88870478, + "learning_rate": 3.9602101121748675e-06, + "loss": 0.91942298, + "num_input_tokens_seen": 16293075, + "router_z_loss_clip": 3.72070312, + "router_z_loss_mlp": 3.78515625, + "step": 763, + "time_per_iteration": 2.9894661903381348 + }, + { + "auxiliary_loss_clip": 0.01585083, + "auxiliary_loss_mlp": 0.01489015, + "balance_loss_clip": 1.21304452, + "balance_loss_mlp": 1.11441195, + "epoch": 0.0918655684482655, + "flos": 14610869451360.0, + "grad_norm": 2.2699940799540435, + "language_loss": 0.72696841, + "learning_rate": 3.960055353659085e-06, + "loss": 0.75770938, + "num_input_tokens_seen": 16310185, + "router_z_loss_clip": 3.72070312, + "router_z_loss_mlp": 3.74609375, + "step": 764, + "time_per_iteration": 2.9506711959838867 + }, + { + "auxiliary_loss_clip": 0.01586141, + "auxiliary_loss_mlp": 0.01492159, + "balance_loss_clip": 1.21486473, + "balance_loss_mlp": 1.10992682, + "epoch": 0.09198581133890459, + "flos": 23436954233280.0, + "grad_norm": 2.219969159108541, + "language_loss": 0.83835822, + "learning_rate": 3.959900297804465e-06, + "loss": 0.86914122, + "num_input_tokens_seen": 16330355, + "router_z_loss_clip": 3.71289062, + "router_z_loss_mlp": 3.81835938, + "step": 765, + "time_per_iteration": 3.109567880630493 + }, + { + "auxiliary_loss_clip": 0.01588343, + "auxiliary_loss_mlp": 0.01485651, + "balance_loss_clip": 1.21753621, + "balance_loss_mlp": 1.10437179, + "epoch": 0.09210605422954368, + "flos": 16797525590880.0, + "grad_norm": 4.295989026352588, + "language_loss": 0.77146852, + "learning_rate": 3.9597449446345276e-06, + "loss": 0.80220842, + "num_input_tokens_seen": 16347600, + "router_z_loss_clip": 3.70898438, + "router_z_loss_mlp": 3.81054688, + "step": 766, + "time_per_iteration": 2.9434609413146973 + }, + { + "auxiliary_loss_clip": 0.01590117, + "auxiliary_loss_mlp": 0.01485492, + "balance_loss_clip": 1.21875989, + "balance_loss_mlp": 1.11203361, + "epoch": 0.09222629712018277, + "flos": 22676449046400.0, + "grad_norm": 2.209070431877779, + "language_loss": 0.84082091, + "learning_rate": 3.95958929417284e-06, + "loss": 0.87157702, + "num_input_tokens_seen": 16365755, + "router_z_loss_clip": 3.71289062, + "router_z_loss_mlp": 3.73046875, + "step": 767, + "time_per_iteration": 2.902653932571411 + }, + { + "auxiliary_loss_clip": 0.01612141, + "auxiliary_loss_mlp": 0.01467709, + "balance_loss_clip": 1.24459529, + "balance_loss_mlp": 1.15719223, + "epoch": 0.09234654001082186, + "flos": 69984601024320.0, + "grad_norm": 0.7681934595426811, + "language_loss": 0.58746111, + "learning_rate": 3.9594333464430145e-06, + "loss": 0.61825967, + "num_input_tokens_seen": 16435245, + "router_z_loss_clip": 3.6796875, + "router_z_loss_mlp": 3.1015625, + "step": 768, + "time_per_iteration": 3.5704827308654785 + }, + { + "auxiliary_loss_clip": 0.01589013, + "auxiliary_loss_mlp": 0.0148363, + "balance_loss_clip": 1.21822476, + "balance_loss_mlp": 1.11207831, + "epoch": 0.09246678290146094, + "flos": 20013524265600.0, + "grad_norm": 1.9396165465044495, + "language_loss": 0.88079429, + "learning_rate": 3.959277101468709e-06, + "loss": 0.91152078, + "num_input_tokens_seen": 16454795, + "router_z_loss_clip": 3.703125, + "router_z_loss_mlp": 3.71484375, + "step": 769, + "time_per_iteration": 2.8768858909606934 + }, + { + "auxiliary_loss_clip": 0.01598152, + "auxiliary_loss_mlp": 0.01497739, + "balance_loss_clip": 1.22703803, + "balance_loss_mlp": 1.13114679, + "epoch": 0.09258702579210004, + "flos": 17749304310240.0, + "grad_norm": 3.0316105110876244, + "language_loss": 0.78763115, + "learning_rate": 3.959120559273624e-06, + "loss": 0.81859004, + "num_input_tokens_seen": 16472580, + "router_z_loss_clip": 3.70898438, + "router_z_loss_mlp": 3.6640625, + "step": 770, + "time_per_iteration": 2.8789408206939697 + }, + { + "auxiliary_loss_clip": 0.0159407, + "auxiliary_loss_mlp": 0.01487903, + "balance_loss_clip": 1.22255683, + "balance_loss_mlp": 1.10319102, + "epoch": 0.09270726868273914, + "flos": 20888649444960.0, + "grad_norm": 2.3292593276197118, + "language_loss": 0.83709586, + "learning_rate": 3.958963719881509e-06, + "loss": 0.86791557, + "num_input_tokens_seen": 16490670, + "router_z_loss_clip": 3.71484375, + "router_z_loss_mlp": 3.84375, + "step": 771, + "time_per_iteration": 2.8944575786590576 + }, + { + "auxiliary_loss_clip": 0.0159569, + "auxiliary_loss_mlp": 0.01480563, + "balance_loss_clip": 1.22340763, + "balance_loss_mlp": 1.09680486, + "epoch": 0.09282751157337822, + "flos": 17017397320320.0, + "grad_norm": 2.1298970099397896, + "language_loss": 0.93867111, + "learning_rate": 3.958806583316154e-06, + "loss": 0.96943367, + "num_input_tokens_seen": 16508640, + "router_z_loss_clip": 3.71875, + "router_z_loss_mlp": 3.83398438, + "step": 772, + "time_per_iteration": 2.9190948009490967 + }, + { + "auxiliary_loss_clip": 0.01594162, + "auxiliary_loss_mlp": 0.01471201, + "balance_loss_clip": 1.22252178, + "balance_loss_mlp": 1.08515322, + "epoch": 0.09294775446401732, + "flos": 32526300924000.0, + "grad_norm": 2.2077643331487073, + "language_loss": 0.78733915, + "learning_rate": 3.9586491496013985e-06, + "loss": 0.81799281, + "num_input_tokens_seen": 16531035, + "router_z_loss_clip": 3.71484375, + "router_z_loss_mlp": 3.85742188, + "step": 773, + "time_per_iteration": 3.082589626312256 + }, + { + "auxiliary_loss_clip": 0.01600049, + "auxiliary_loss_mlp": 0.01491595, + "balance_loss_clip": 1.22794342, + "balance_loss_mlp": 1.1131773, + "epoch": 0.0930679973546564, + "flos": 18261718852320.0, + "grad_norm": 6.979786422272111, + "language_loss": 0.83090055, + "learning_rate": 3.958491418761124e-06, + "loss": 0.861817, + "num_input_tokens_seen": 16548605, + "router_z_loss_clip": 3.72265625, + "router_z_loss_mlp": 3.77929688, + "step": 774, + "time_per_iteration": 2.8605053424835205 + }, + { + "auxiliary_loss_clip": 0.01591728, + "auxiliary_loss_mlp": 0.01498456, + "balance_loss_clip": 1.21978498, + "balance_loss_mlp": 1.1064961, + "epoch": 0.0931882402452955, + "flos": 21101542392960.0, + "grad_norm": 2.5339501658738244, + "language_loss": 0.72452766, + "learning_rate": 3.958333390819258e-06, + "loss": 0.75542951, + "num_input_tokens_seen": 16565535, + "router_z_loss_clip": 3.71679688, + "router_z_loss_mlp": 3.91796875, + "step": 775, + "time_per_iteration": 2.9734857082366943 + }, + { + "auxiliary_loss_clip": 0.01592496, + "auxiliary_loss_mlp": 0.01482287, + "balance_loss_clip": 1.22108984, + "balance_loss_mlp": 1.09395099, + "epoch": 0.0933084831359346, + "flos": 24209596431360.0, + "grad_norm": 2.5738025886828275, + "language_loss": 0.80395067, + "learning_rate": 3.9581750657997754e-06, + "loss": 0.83469844, + "num_input_tokens_seen": 16584900, + "router_z_loss_clip": 3.7109375, + "router_z_loss_mlp": 3.8828125, + "step": 776, + "time_per_iteration": 3.064129114151001 + }, + { + "auxiliary_loss_clip": 0.01588996, + "auxiliary_loss_mlp": 0.0147383, + "balance_loss_clip": 1.21639156, + "balance_loss_mlp": 1.08091581, + "epoch": 0.09342872602657368, + "flos": 25482440304000.0, + "grad_norm": 2.0133589036927577, + "language_loss": 0.89760673, + "learning_rate": 3.95801644372669e-06, + "loss": 0.92823505, + "num_input_tokens_seen": 16604805, + "router_z_loss_clip": 3.7265625, + "router_z_loss_mlp": 3.9296875, + "step": 777, + "time_per_iteration": 2.967244863510132 + }, + { + "auxiliary_loss_clip": 0.01591104, + "auxiliary_loss_mlp": 0.01486727, + "balance_loss_clip": 1.21856761, + "balance_loss_mlp": 1.10888135, + "epoch": 0.09354896891721277, + "flos": 23151314345760.0, + "grad_norm": 2.1672316648178516, + "language_loss": 0.8505379, + "learning_rate": 3.957857524624068e-06, + "loss": 0.88131618, + "num_input_tokens_seen": 16623685, + "router_z_loss_clip": 3.72265625, + "router_z_loss_mlp": 3.7734375, + "step": 778, + "time_per_iteration": 2.9956116676330566 + }, + { + "auxiliary_loss_clip": 0.01592413, + "auxiliary_loss_mlp": 0.01488971, + "balance_loss_clip": 1.22005224, + "balance_loss_mlp": 1.1122694, + "epoch": 0.09366921180785186, + "flos": 24281850304800.0, + "grad_norm": 2.2614081623174376, + "language_loss": 0.89835292, + "learning_rate": 3.957698308516016e-06, + "loss": 0.92916679, + "num_input_tokens_seen": 16644985, + "router_z_loss_clip": 3.72460938, + "router_z_loss_mlp": 3.76367188, + "step": 779, + "time_per_iteration": 2.916806221008301 + }, + { + "auxiliary_loss_clip": 0.01604696, + "auxiliary_loss_mlp": 0.01491663, + "balance_loss_clip": 1.23193669, + "balance_loss_mlp": 1.11782229, + "epoch": 0.09378945469849095, + "flos": 18731805203520.0, + "grad_norm": 1.9923750797727602, + "language_loss": 0.82523859, + "learning_rate": 3.957538795426688e-06, + "loss": 0.85620219, + "num_input_tokens_seen": 16662410, + "router_z_loss_clip": 3.7265625, + "router_z_loss_mlp": 3.73242188, + "step": 780, + "time_per_iteration": 2.971165895462036 + }, + { + "auxiliary_loss_clip": 0.01595357, + "auxiliary_loss_mlp": 0.01498439, + "balance_loss_clip": 1.22322655, + "balance_loss_mlp": 1.12574339, + "epoch": 0.09390969758913004, + "flos": 23220913248000.0, + "grad_norm": 2.7503556149082713, + "language_loss": 0.76814628, + "learning_rate": 3.9573789853802804e-06, + "loss": 0.79908431, + "num_input_tokens_seen": 16680885, + "router_z_loss_clip": 3.72265625, + "router_z_loss_mlp": 3.72265625, + "step": 781, + "time_per_iteration": 2.9587767124176025 + }, + { + "auxiliary_loss_clip": 0.0160058, + "auxiliary_loss_mlp": 0.01511406, + "balance_loss_clip": 1.22695875, + "balance_loss_mlp": 1.13851893, + "epoch": 0.09402994047976913, + "flos": 19648803435840.0, + "grad_norm": 2.264601944791406, + "language_loss": 0.74960721, + "learning_rate": 3.957218878401037e-06, + "loss": 0.78072703, + "num_input_tokens_seen": 16699375, + "router_z_loss_clip": 3.734375, + "router_z_loss_mlp": 3.72460938, + "step": 782, + "time_per_iteration": 2.9361069202423096 + }, + { + "auxiliary_loss_clip": 0.01595565, + "auxiliary_loss_mlp": 0.01516156, + "balance_loss_clip": 1.22260022, + "balance_loss_mlp": 1.14937258, + "epoch": 0.09415018337040823, + "flos": 29422836192960.0, + "grad_norm": 2.427780345328594, + "language_loss": 0.89363492, + "learning_rate": 3.957058474513246e-06, + "loss": 0.92475218, + "num_input_tokens_seen": 16719230, + "router_z_loss_clip": 3.7265625, + "router_z_loss_mlp": 3.66601562, + "step": 783, + "time_per_iteration": 2.9515163898468018 + }, + { + "auxiliary_loss_clip": 0.01600269, + "auxiliary_loss_mlp": 0.01514921, + "balance_loss_clip": 1.22673988, + "balance_loss_mlp": 1.1418438, + "epoch": 0.09427042626104731, + "flos": 24574431045600.0, + "grad_norm": 2.1774938782140336, + "language_loss": 0.78376603, + "learning_rate": 3.956897773741241e-06, + "loss": 0.81491792, + "num_input_tokens_seen": 16738220, + "router_z_loss_clip": 3.734375, + "router_z_loss_mlp": 3.72851562, + "step": 784, + "time_per_iteration": 2.9206647872924805 + }, + { + "auxiliary_loss_clip": 0.01593332, + "auxiliary_loss_mlp": 0.01503606, + "balance_loss_clip": 1.21825528, + "balance_loss_mlp": 1.13663173, + "epoch": 0.09439066915168641, + "flos": 26362003078080.0, + "grad_norm": 11.440429617607325, + "language_loss": 0.71748984, + "learning_rate": 3.956736776109398e-06, + "loss": 0.74845922, + "num_input_tokens_seen": 16759395, + "router_z_loss_clip": 3.75195312, + "router_z_loss_mlp": 3.66601562, + "step": 785, + "time_per_iteration": 3.7778406143188477 + }, + { + "auxiliary_loss_clip": 0.01596515, + "auxiliary_loss_mlp": 0.0149649, + "balance_loss_clip": 1.2230736, + "balance_loss_mlp": 1.13161373, + "epoch": 0.09451091204232549, + "flos": 19429310988000.0, + "grad_norm": 3.225968648232187, + "language_loss": 0.83691072, + "learning_rate": 3.956575481642143e-06, + "loss": 0.86784077, + "num_input_tokens_seen": 16778285, + "router_z_loss_clip": 3.73046875, + "router_z_loss_mlp": 3.6484375, + "step": 786, + "time_per_iteration": 2.950855255126953 + }, + { + "auxiliary_loss_clip": 0.01593472, + "auxiliary_loss_mlp": 0.01492395, + "balance_loss_clip": 1.21921515, + "balance_loss_mlp": 1.12561202, + "epoch": 0.09463115493296459, + "flos": 25370058072960.0, + "grad_norm": 2.5452840054508377, + "language_loss": 0.75023615, + "learning_rate": 3.956413890363943e-06, + "loss": 0.78109485, + "num_input_tokens_seen": 16795265, + "router_z_loss_clip": 3.74023438, + "router_z_loss_mlp": 3.66796875, + "step": 787, + "time_per_iteration": 3.8202004432678223 + }, + { + "auxiliary_loss_clip": 0.01601272, + "auxiliary_loss_mlp": 0.01511233, + "balance_loss_clip": 1.22737169, + "balance_loss_mlp": 1.14635682, + "epoch": 0.09475139782360369, + "flos": 10124757731520.0, + "grad_norm": 2.1471907356310793, + "language_loss": 0.81696409, + "learning_rate": 3.956252002299312e-06, + "loss": 0.8480891, + "num_input_tokens_seen": 16811165, + "router_z_loss_clip": 3.73632812, + "router_z_loss_mlp": 3.64453125, + "step": 788, + "time_per_iteration": 3.76133131980896 + }, + { + "auxiliary_loss_clip": 0.01597837, + "auxiliary_loss_mlp": 0.01549578, + "balance_loss_clip": 1.22358704, + "balance_loss_mlp": 1.19214046, + "epoch": 0.09487164071424277, + "flos": 17232603886080.0, + "grad_norm": 3.094012013852655, + "language_loss": 0.91231191, + "learning_rate": 3.956089817472807e-06, + "loss": 0.94378614, + "num_input_tokens_seen": 16828470, + "router_z_loss_clip": 3.74609375, + "router_z_loss_mlp": 3.57421875, + "step": 789, + "time_per_iteration": 3.7711398601531982 + }, + { + "auxiliary_loss_clip": 0.01605934, + "auxiliary_loss_mlp": 0.01490125, + "balance_loss_clip": 1.23062098, + "balance_loss_mlp": 1.12486827, + "epoch": 0.09499188360488187, + "flos": 30852362751840.0, + "grad_norm": 2.8900446044144155, + "language_loss": 0.85863125, + "learning_rate": 3.955927335909032e-06, + "loss": 0.88959181, + "num_input_tokens_seen": 16851680, + "router_z_loss_clip": 3.75195312, + "router_z_loss_mlp": 3.65039062, + "step": 790, + "time_per_iteration": 3.03147029876709 + }, + { + "auxiliary_loss_clip": 0.01601119, + "auxiliary_loss_mlp": 0.01492806, + "balance_loss_clip": 1.2276175, + "balance_loss_mlp": 1.12754869, + "epoch": 0.09511212649552095, + "flos": 29354299279200.0, + "grad_norm": 3.2688134394462325, + "language_loss": 0.76289177, + "learning_rate": 3.955764557632634e-06, + "loss": 0.79383099, + "num_input_tokens_seen": 16871490, + "router_z_loss_clip": 3.734375, + "router_z_loss_mlp": 3.65039062, + "step": 791, + "time_per_iteration": 3.0124855041503906 + }, + { + "auxiliary_loss_clip": 0.01602905, + "auxiliary_loss_mlp": 0.01494509, + "balance_loss_clip": 1.22813201, + "balance_loss_mlp": 1.12715364, + "epoch": 0.09523236938616005, + "flos": 10380206439360.0, + "grad_norm": 3.9753575906961776, + "language_loss": 0.94844592, + "learning_rate": 3.955601482668309e-06, + "loss": 0.97942007, + "num_input_tokens_seen": 16889350, + "router_z_loss_clip": 3.74609375, + "router_z_loss_mlp": 3.66992188, + "step": 792, + "time_per_iteration": 2.9208335876464844 + }, + { + "auxiliary_loss_clip": 0.01595556, + "auxiliary_loss_mlp": 0.01481937, + "balance_loss_clip": 1.22118926, + "balance_loss_mlp": 1.1082871, + "epoch": 0.09535261227679913, + "flos": 19063793666880.0, + "grad_norm": 3.2670093329805017, + "language_loss": 0.88604558, + "learning_rate": 3.955438111040794e-06, + "loss": 0.91682053, + "num_input_tokens_seen": 16907625, + "router_z_loss_clip": 3.74609375, + "router_z_loss_mlp": 3.734375, + "step": 793, + "time_per_iteration": 2.8681585788726807 + }, + { + "auxiliary_loss_clip": 0.01599203, + "auxiliary_loss_mlp": 0.01496441, + "balance_loss_clip": 1.22644377, + "balance_loss_mlp": 1.1252712, + "epoch": 0.09547285516743823, + "flos": 20925098771040.0, + "grad_norm": 3.2799636136202275, + "language_loss": 0.80034649, + "learning_rate": 3.955274442774873e-06, + "loss": 0.83130288, + "num_input_tokens_seen": 16926205, + "router_z_loss_clip": 3.73046875, + "router_z_loss_mlp": 3.7109375, + "step": 794, + "time_per_iteration": 2.9206666946411133 + }, + { + "auxiliary_loss_clip": 0.01602154, + "auxiliary_loss_mlp": 0.01507722, + "balance_loss_clip": 1.22683382, + "balance_loss_mlp": 1.13960361, + "epoch": 0.09559309805807732, + "flos": 30157056800640.0, + "grad_norm": 5.480688389778644, + "language_loss": 0.71120107, + "learning_rate": 3.9551104778953725e-06, + "loss": 0.7422998, + "num_input_tokens_seen": 16946500, + "router_z_loss_clip": 3.75195312, + "router_z_loss_mlp": 3.6796875, + "step": 795, + "time_per_iteration": 2.988307237625122 + }, + { + "auxiliary_loss_clip": 0.01598268, + "auxiliary_loss_mlp": 0.01492587, + "balance_loss_clip": 1.22455096, + "balance_loss_mlp": 1.11741138, + "epoch": 0.0957133409487164, + "flos": 21068734170240.0, + "grad_norm": 4.633869416693014, + "language_loss": 0.8540048, + "learning_rate": 3.954946216427167e-06, + "loss": 0.88491338, + "num_input_tokens_seen": 16966960, + "router_z_loss_clip": 3.73828125, + "router_z_loss_mlp": 3.75, + "step": 796, + "time_per_iteration": 2.923560380935669 + }, + { + "auxiliary_loss_clip": 0.017008, + "auxiliary_loss_mlp": 0.01486992, + "balance_loss_clip": 1.32916319, + "balance_loss_mlp": 1.13832855, + "epoch": 0.0958335838393555, + "flos": 71304286903200.0, + "grad_norm": 0.9068570395263417, + "language_loss": 0.6160183, + "learning_rate": 3.954781658395176e-06, + "loss": 0.64789629, + "num_input_tokens_seen": 17023215, + "router_z_loss_clip": 3.71875, + "router_z_loss_mlp": 3.4921875, + "step": 797, + "time_per_iteration": 3.3950917720794678 + }, + { + "auxiliary_loss_clip": 0.0160428, + "auxiliary_loss_mlp": 0.01493505, + "balance_loss_clip": 1.23152041, + "balance_loss_mlp": 1.11584949, + "epoch": 0.09595382672999458, + "flos": 21875208651360.0, + "grad_norm": 1.9443605855286201, + "language_loss": 0.92081177, + "learning_rate": 3.95461680382436e-06, + "loss": 0.95178956, + "num_input_tokens_seen": 17042140, + "router_z_loss_clip": 3.73046875, + "router_z_loss_mlp": 3.77148438, + "step": 798, + "time_per_iteration": 2.8940649032592773 + }, + { + "auxiliary_loss_clip": 0.01605088, + "auxiliary_loss_mlp": 0.01489358, + "balance_loss_clip": 1.23113859, + "balance_loss_mlp": 1.11799693, + "epoch": 0.09607406962063368, + "flos": 18697441926240.0, + "grad_norm": 3.230803302984367, + "language_loss": 0.86457419, + "learning_rate": 3.9544516527397295e-06, + "loss": 0.89551866, + "num_input_tokens_seen": 17058490, + "router_z_loss_clip": 3.73828125, + "router_z_loss_mlp": 3.71289062, + "step": 799, + "time_per_iteration": 2.980281352996826 + }, + { + "auxiliary_loss_clip": 0.01604998, + "auxiliary_loss_mlp": 0.01482138, + "balance_loss_clip": 1.23372912, + "balance_loss_mlp": 1.10677147, + "epoch": 0.09619431251127276, + "flos": 22570855956000.0, + "grad_norm": 2.41143893788548, + "language_loss": 0.80453753, + "learning_rate": 3.954286205166338e-06, + "loss": 0.83540893, + "num_input_tokens_seen": 17079655, + "router_z_loss_clip": 3.70898438, + "router_z_loss_mlp": 3.75, + "step": 800, + "time_per_iteration": 2.9834773540496826 + }, + { + "auxiliary_loss_clip": 0.01606981, + "auxiliary_loss_mlp": 0.01490605, + "balance_loss_clip": 1.23426747, + "balance_loss_mlp": 1.12019789, + "epoch": 0.09631455540191186, + "flos": 14247969173280.0, + "grad_norm": 3.8490870807842836, + "language_loss": 0.83806872, + "learning_rate": 3.954120461129282e-06, + "loss": 0.86904454, + "num_input_tokens_seen": 17097065, + "router_z_loss_clip": 3.7265625, + "router_z_loss_mlp": 3.69921875, + "step": 801, + "time_per_iteration": 2.8821518421173096 + }, + { + "auxiliary_loss_clip": 0.01615755, + "auxiliary_loss_mlp": 0.01503662, + "balance_loss_clip": 1.24419594, + "balance_loss_mlp": 1.13211024, + "epoch": 0.09643479829255096, + "flos": 20742472859040.0, + "grad_norm": 3.798450742080125, + "language_loss": 0.84175408, + "learning_rate": 3.953954420653706e-06, + "loss": 0.87294823, + "num_input_tokens_seen": 17114090, + "router_z_loss_clip": 3.71289062, + "router_z_loss_mlp": 3.71289062, + "step": 802, + "time_per_iteration": 2.9721837043762207 + }, + { + "auxiliary_loss_clip": 0.01614548, + "auxiliary_loss_mlp": 0.01508145, + "balance_loss_clip": 1.24331498, + "balance_loss_mlp": 1.14212489, + "epoch": 0.09655504118319004, + "flos": 24422489379360.0, + "grad_norm": 2.2275893731652743, + "language_loss": 0.88105166, + "learning_rate": 3.953788083764798e-06, + "loss": 0.91227859, + "num_input_tokens_seen": 17133325, + "router_z_loss_clip": 3.71289062, + "router_z_loss_mlp": 3.65625, + "step": 803, + "time_per_iteration": 2.986170530319214 + }, + { + "auxiliary_loss_clip": 0.01608032, + "auxiliary_loss_mlp": 0.01509619, + "balance_loss_clip": 1.2373637, + "balance_loss_mlp": 1.14359891, + "epoch": 0.09667528407382914, + "flos": 18443927554560.0, + "grad_norm": 2.512614282819921, + "language_loss": 0.9232052, + "learning_rate": 3.953621450487792e-06, + "loss": 0.9543817, + "num_input_tokens_seen": 17151945, + "router_z_loss_clip": 3.70703125, + "router_z_loss_mlp": 3.65625, + "step": 804, + "time_per_iteration": 2.9976649284362793 + }, + { + "auxiliary_loss_clip": 0.01685281, + "auxiliary_loss_mlp": 0.01437233, + "balance_loss_clip": 1.31601775, + "balance_loss_mlp": 1.10306549, + "epoch": 0.09679552696446822, + "flos": 70824376794240.0, + "grad_norm": 0.8479131810675808, + "language_loss": 0.61176938, + "learning_rate": 3.953454520847964e-06, + "loss": 0.64299446, + "num_input_tokens_seen": 17216790, + "router_z_loss_clip": 3.6875, + "router_z_loss_mlp": 3.34375, + "step": 805, + "time_per_iteration": 3.5578489303588867 + }, + { + "auxiliary_loss_clip": 0.01608382, + "auxiliary_loss_mlp": 0.01494083, + "balance_loss_clip": 1.23736405, + "balance_loss_mlp": 1.12176824, + "epoch": 0.09691576985510732, + "flos": 21947728021920.0, + "grad_norm": 2.2183065586336523, + "language_loss": 0.73684955, + "learning_rate": 3.9532872948706395e-06, + "loss": 0.76787424, + "num_input_tokens_seen": 17236285, + "router_z_loss_clip": 3.70898438, + "router_z_loss_mlp": 3.71875, + "step": 806, + "time_per_iteration": 2.9158554077148438 + }, + { + "auxiliary_loss_clip": 0.01607997, + "auxiliary_loss_mlp": 0.0149416, + "balance_loss_clip": 1.23623347, + "balance_loss_mlp": 1.12604213, + "epoch": 0.09703601274574641, + "flos": 17967014134560.0, + "grad_norm": 3.3441342412487205, + "language_loss": 0.83111906, + "learning_rate": 3.9531197725811845e-06, + "loss": 0.86214066, + "num_input_tokens_seen": 17251670, + "router_z_loss_clip": 3.71679688, + "router_z_loss_mlp": 3.67773438, + "step": 807, + "time_per_iteration": 2.962414026260376 + }, + { + "auxiliary_loss_clip": 0.01610402, + "auxiliary_loss_mlp": 0.01486368, + "balance_loss_clip": 1.23890316, + "balance_loss_mlp": 1.10928535, + "epoch": 0.0971562556363855, + "flos": 22164262073280.0, + "grad_norm": 1.9022236748267083, + "language_loss": 0.88175726, + "learning_rate": 3.952951954005013e-06, + "loss": 0.91272497, + "num_input_tokens_seen": 17271355, + "router_z_loss_clip": 3.72070312, + "router_z_loss_mlp": 3.765625, + "step": 808, + "time_per_iteration": 3.0857999324798584 + }, + { + "auxiliary_loss_clip": 0.01603178, + "auxiliary_loss_mlp": 0.01484211, + "balance_loss_clip": 1.23257291, + "balance_loss_mlp": 1.10674632, + "epoch": 0.0972764985270246, + "flos": 25851105662400.0, + "grad_norm": 1.8086840752562425, + "language_loss": 0.84608191, + "learning_rate": 3.952783839167584e-06, + "loss": 0.87695581, + "num_input_tokens_seen": 17291400, + "router_z_loss_clip": 3.703125, + "router_z_loss_mlp": 3.77148438, + "step": 809, + "time_per_iteration": 3.007763624191284 + }, + { + "auxiliary_loss_clip": 0.01610641, + "auxiliary_loss_mlp": 0.0148876, + "balance_loss_clip": 1.24044561, + "balance_loss_mlp": 1.11396623, + "epoch": 0.09739674141766368, + "flos": 20341454415840.0, + "grad_norm": 11.485757608863524, + "language_loss": 0.74471736, + "learning_rate": 3.952615428094398e-06, + "loss": 0.77571142, + "num_input_tokens_seen": 17310920, + "router_z_loss_clip": 3.70703125, + "router_z_loss_mlp": 3.7421875, + "step": 810, + "time_per_iteration": 2.934685468673706 + }, + { + "auxiliary_loss_clip": 0.01606851, + "auxiliary_loss_mlp": 0.01511566, + "balance_loss_clip": 1.2373445, + "balance_loss_mlp": 1.14573669, + "epoch": 0.09751698430830277, + "flos": 15744932729280.0, + "grad_norm": 6.66471875132281, + "language_loss": 0.73600829, + "learning_rate": 3.952446720811004e-06, + "loss": 0.76719242, + "num_input_tokens_seen": 17329245, + "router_z_loss_clip": 3.6953125, + "router_z_loss_mlp": 3.65625, + "step": 811, + "time_per_iteration": 2.9941747188568115 + }, + { + "auxiliary_loss_clip": 0.01661079, + "auxiliary_loss_mlp": 0.01449509, + "balance_loss_clip": 1.29426003, + "balance_loss_mlp": 1.12831116, + "epoch": 0.09763722719894186, + "flos": 63723130503840.0, + "grad_norm": 0.854144660070282, + "language_loss": 0.63527924, + "learning_rate": 3.952277717342995e-06, + "loss": 0.66638511, + "num_input_tokens_seen": 17395680, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 3.2109375, + "step": 812, + "time_per_iteration": 4.35107159614563 + }, + { + "auxiliary_loss_clip": 0.01606395, + "auxiliary_loss_mlp": 0.01498696, + "balance_loss_clip": 1.235116, + "balance_loss_mlp": 1.13649011, + "epoch": 0.09775747008958095, + "flos": 22093108116480.0, + "grad_norm": 3.821897817372128, + "language_loss": 0.85672617, + "learning_rate": 3.952108417716009e-06, + "loss": 0.88777709, + "num_input_tokens_seen": 17415135, + "router_z_loss_clip": 3.71289062, + "router_z_loss_mlp": 3.62304688, + "step": 813, + "time_per_iteration": 2.9508917331695557 + }, + { + "auxiliary_loss_clip": 0.01615352, + "auxiliary_loss_mlp": 0.01491504, + "balance_loss_clip": 1.2437048, + "balance_loss_mlp": 1.10259557, + "epoch": 0.09787771298022005, + "flos": 21288074905440.0, + "grad_norm": 2.109956546837403, + "language_loss": 0.84988964, + "learning_rate": 3.951938821955727e-06, + "loss": 0.8809582, + "num_input_tokens_seen": 17434535, + "router_z_loss_clip": 3.71875, + "router_z_loss_mlp": 3.88867188, + "step": 814, + "time_per_iteration": 2.9978699684143066 + }, + { + "auxiliary_loss_clip": 0.01608353, + "auxiliary_loss_mlp": 0.01493698, + "balance_loss_clip": 1.23760343, + "balance_loss_mlp": 1.12519801, + "epoch": 0.09799795587085913, + "flos": 22056772574880.0, + "grad_norm": 2.311859524126438, + "language_loss": 0.76809114, + "learning_rate": 3.9517689300878786e-06, + "loss": 0.7991116, + "num_input_tokens_seen": 17454270, + "router_z_loss_clip": 3.70703125, + "router_z_loss_mlp": 3.68554688, + "step": 815, + "time_per_iteration": 3.8230066299438477 + }, + { + "auxiliary_loss_clip": 0.01600782, + "auxiliary_loss_mlp": 0.01488268, + "balance_loss_clip": 1.23013484, + "balance_loss_mlp": 1.11862433, + "epoch": 0.09811819876149823, + "flos": 22165930912320.0, + "grad_norm": 3.406480460963653, + "language_loss": 0.78814775, + "learning_rate": 3.951598742138236e-06, + "loss": 0.81903827, + "num_input_tokens_seen": 17472995, + "router_z_loss_clip": 3.70898438, + "router_z_loss_mlp": 3.6953125, + "step": 816, + "time_per_iteration": 4.664109468460083 + }, + { + "auxiliary_loss_clip": 0.01600632, + "auxiliary_loss_mlp": 0.01489123, + "balance_loss_clip": 1.23119593, + "balance_loss_mlp": 1.11471057, + "epoch": 0.09823844165213731, + "flos": 22232988627840.0, + "grad_norm": 2.5888538363778673, + "language_loss": 0.79703808, + "learning_rate": 3.951428258132615e-06, + "loss": 0.82793564, + "num_input_tokens_seen": 17491115, + "router_z_loss_clip": 3.69726562, + "router_z_loss_mlp": 3.7421875, + "step": 817, + "time_per_iteration": 3.228795289993286 + }, + { + "auxiliary_loss_clip": 0.01609624, + "auxiliary_loss_mlp": 0.01498808, + "balance_loss_clip": 1.24168992, + "balance_loss_mlp": 1.12401354, + "epoch": 0.09835868454277641, + "flos": 22489688964960.0, + "grad_norm": 2.7709139803542064, + "language_loss": 0.84863687, + "learning_rate": 3.951257478096879e-06, + "loss": 0.87972116, + "num_input_tokens_seen": 17509480, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 3.74609375, + "step": 818, + "time_per_iteration": 2.9861228466033936 + }, + { + "auxiliary_loss_clip": 0.01599297, + "auxiliary_loss_mlp": 0.01488439, + "balance_loss_clip": 1.23014414, + "balance_loss_mlp": 1.11745989, + "epoch": 0.0984789274334155, + "flos": 16364419560000.0, + "grad_norm": 9.41818292854036, + "language_loss": 0.68559462, + "learning_rate": 3.951086402056936e-06, + "loss": 0.71647197, + "num_input_tokens_seen": 17524080, + "router_z_loss_clip": 3.68554688, + "router_z_loss_mlp": 3.70507812, + "step": 819, + "time_per_iteration": 2.9824936389923096 + }, + { + "auxiliary_loss_clip": 0.0161365, + "auxiliary_loss_mlp": 0.01494235, + "balance_loss_clip": 1.24327111, + "balance_loss_mlp": 1.12039447, + "epoch": 0.09859917032405459, + "flos": 24245856116640.0, + "grad_norm": 2.3717340639254187, + "language_loss": 0.83607209, + "learning_rate": 3.950915030038735e-06, + "loss": 0.86715096, + "num_input_tokens_seen": 17543875, + "router_z_loss_clip": 3.703125, + "router_z_loss_mlp": 3.73632812, + "step": 820, + "time_per_iteration": 3.0001862049102783 + }, + { + "auxiliary_loss_clip": 0.01605204, + "auxiliary_loss_mlp": 0.01481913, + "balance_loss_clip": 1.23537242, + "balance_loss_mlp": 1.10444868, + "epoch": 0.09871941321469369, + "flos": 17422094795040.0, + "grad_norm": 4.106611059650996, + "language_loss": 0.8402952, + "learning_rate": 3.9507433620682765e-06, + "loss": 0.87116647, + "num_input_tokens_seen": 17560810, + "router_z_loss_clip": 3.6953125, + "router_z_loss_mlp": 3.77148438, + "step": 821, + "time_per_iteration": 3.188257932662964 + }, + { + "auxiliary_loss_clip": 0.01589447, + "auxiliary_loss_mlp": 0.01481582, + "balance_loss_clip": 1.21808243, + "balance_loss_mlp": 1.10526204, + "epoch": 0.09883965610533277, + "flos": 28479970591200.0, + "grad_norm": 1.7965027277190722, + "language_loss": 0.88463104, + "learning_rate": 3.9505713981716e-06, + "loss": 0.91534132, + "num_input_tokens_seen": 17583640, + "router_z_loss_clip": 3.71484375, + "router_z_loss_mlp": 3.75976562, + "step": 822, + "time_per_iteration": 3.011378288269043 + }, + { + "auxiliary_loss_clip": 0.01597917, + "auxiliary_loss_mlp": 0.01492191, + "balance_loss_clip": 1.22705412, + "balance_loss_mlp": 1.12330937, + "epoch": 0.09895989899597187, + "flos": 23696081972640.0, + "grad_norm": 2.961098985213273, + "language_loss": 0.81463623, + "learning_rate": 3.950399138374795e-06, + "loss": 0.84553725, + "num_input_tokens_seen": 17602720, + "router_z_loss_clip": 3.70703125, + "router_z_loss_mlp": 3.68554688, + "step": 823, + "time_per_iteration": 2.933398962020874 + }, + { + "auxiliary_loss_clip": 0.01595226, + "auxiliary_loss_mlp": 0.01481828, + "balance_loss_clip": 1.22394276, + "balance_loss_mlp": 1.11237454, + "epoch": 0.09908014188661095, + "flos": 24681844687680.0, + "grad_norm": 3.3519926424487356, + "language_loss": 0.74252045, + "learning_rate": 3.95022658270399e-06, + "loss": 0.77329099, + "num_input_tokens_seen": 17623085, + "router_z_loss_clip": 3.71289062, + "router_z_loss_mlp": 3.69140625, + "step": 824, + "time_per_iteration": 2.9757204055786133 + }, + { + "auxiliary_loss_clip": 0.01600352, + "auxiliary_loss_mlp": 0.01491524, + "balance_loss_clip": 1.22795606, + "balance_loss_mlp": 1.12016273, + "epoch": 0.09920038477725004, + "flos": 14066063896320.0, + "grad_norm": 2.296640405182843, + "language_loss": 0.78216195, + "learning_rate": 3.9500537311853635e-06, + "loss": 0.81308073, + "num_input_tokens_seen": 17641040, + "router_z_loss_clip": 3.72851562, + "router_z_loss_mlp": 3.7109375, + "step": 825, + "time_per_iteration": 2.903573989868164 + }, + { + "auxiliary_loss_clip": 0.0159973, + "auxiliary_loss_mlp": 0.01486602, + "balance_loss_clip": 1.22987401, + "balance_loss_mlp": 1.1089468, + "epoch": 0.09932062766788914, + "flos": 13408762325760.0, + "grad_norm": 4.25079973798522, + "language_loss": 0.83351773, + "learning_rate": 3.949880583845136e-06, + "loss": 0.86438107, + "num_input_tokens_seen": 17659115, + "router_z_loss_clip": 3.69921875, + "router_z_loss_mlp": 3.7734375, + "step": 826, + "time_per_iteration": 2.9537856578826904 + }, + { + "auxiliary_loss_clip": 0.01595834, + "auxiliary_loss_mlp": 0.01493604, + "balance_loss_clip": 1.22509098, + "balance_loss_mlp": 1.11652076, + "epoch": 0.09944087055852822, + "flos": 19502816490720.0, + "grad_norm": 1.8020207073131258, + "language_loss": 0.81058002, + "learning_rate": 3.949707140709575e-06, + "loss": 0.84147435, + "num_input_tokens_seen": 17678845, + "router_z_loss_clip": 3.70898438, + "router_z_loss_mlp": 3.76757812, + "step": 827, + "time_per_iteration": 2.955293893814087 + }, + { + "auxiliary_loss_clip": 0.01600863, + "auxiliary_loss_mlp": 0.01476405, + "balance_loss_clip": 1.23040986, + "balance_loss_mlp": 1.1031369, + "epoch": 0.09956111344916732, + "flos": 17751049005600.0, + "grad_norm": 3.573271136617422, + "language_loss": 0.83497667, + "learning_rate": 3.949533401804991e-06, + "loss": 0.86574936, + "num_input_tokens_seen": 17695750, + "router_z_loss_clip": 3.703125, + "router_z_loss_mlp": 3.73046875, + "step": 828, + "time_per_iteration": 2.9340837001800537 + }, + { + "auxiliary_loss_clip": 0.01593356, + "auxiliary_loss_mlp": 0.01497396, + "balance_loss_clip": 1.2224859, + "balance_loss_mlp": 1.1237464, + "epoch": 0.0996813563398064, + "flos": 17969517393120.0, + "grad_norm": 2.237598563077506, + "language_loss": 0.90533584, + "learning_rate": 3.949359367157739e-06, + "loss": 0.93624336, + "num_input_tokens_seen": 17714445, + "router_z_loss_clip": 3.7109375, + "router_z_loss_mlp": 3.73242188, + "step": 829, + "time_per_iteration": 2.961787223815918 + }, + { + "auxiliary_loss_clip": 0.01597125, + "auxiliary_loss_mlp": 0.01494761, + "balance_loss_clip": 1.22706413, + "balance_loss_mlp": 1.12053955, + "epoch": 0.0998015992304455, + "flos": 17458961330880.0, + "grad_norm": 2.227771360593873, + "language_loss": 0.75739348, + "learning_rate": 3.949185036794222e-06, + "loss": 0.78831238, + "num_input_tokens_seen": 17732455, + "router_z_loss_clip": 3.70117188, + "router_z_loss_mlp": 3.73828125, + "step": 830, + "time_per_iteration": 2.9825687408447266 + }, + { + "auxiliary_loss_clip": 0.01601769, + "auxiliary_loss_mlp": 0.01504416, + "balance_loss_clip": 1.23154521, + "balance_loss_mlp": 1.13362777, + "epoch": 0.0999218421210846, + "flos": 25891347804480.0, + "grad_norm": 4.475974473024448, + "language_loss": 0.79047364, + "learning_rate": 3.949010410740884e-06, + "loss": 0.82153547, + "num_input_tokens_seen": 17755280, + "router_z_loss_clip": 3.70117188, + "router_z_loss_mlp": 3.70507812, + "step": 831, + "time_per_iteration": 3.086723804473877 + }, + { + "auxiliary_loss_clip": 0.01590391, + "auxiliary_loss_mlp": 0.01486064, + "balance_loss_clip": 1.21752143, + "balance_loss_mlp": 1.11546612, + "epoch": 0.10004208501172368, + "flos": 21218096721600.0, + "grad_norm": 1.963700390749524, + "language_loss": 0.86295819, + "learning_rate": 3.948835489024216e-06, + "loss": 0.89372271, + "num_input_tokens_seen": 17775015, + "router_z_loss_clip": 3.73632812, + "router_z_loss_mlp": 3.70507812, + "step": 832, + "time_per_iteration": 3.0150978565216064 + }, + { + "auxiliary_loss_clip": 0.01603825, + "auxiliary_loss_mlp": 0.01494465, + "balance_loss_clip": 1.23359501, + "balance_loss_mlp": 1.12005258, + "epoch": 0.10016232790236278, + "flos": 17350371915840.0, + "grad_norm": 2.504708824373895, + "language_loss": 0.90172625, + "learning_rate": 3.948660271670755e-06, + "loss": 0.9327091, + "num_input_tokens_seen": 17792165, + "router_z_loss_clip": 3.69921875, + "router_z_loss_mlp": 3.74023438, + "step": 833, + "time_per_iteration": 3.038119316101074 + }, + { + "auxiliary_loss_clip": 0.01597056, + "auxiliary_loss_mlp": 0.01495681, + "balance_loss_clip": 1.22490704, + "balance_loss_mlp": 1.12470138, + "epoch": 0.10028257079300186, + "flos": 25668934888320.0, + "grad_norm": 2.3322798516846897, + "language_loss": 0.84290719, + "learning_rate": 3.948484758707079e-06, + "loss": 0.87383461, + "num_input_tokens_seen": 17811765, + "router_z_loss_clip": 3.72265625, + "router_z_loss_mlp": 3.70703125, + "step": 834, + "time_per_iteration": 3.045759677886963 + }, + { + "auxiliary_loss_clip": 0.01599749, + "auxiliary_loss_mlp": 0.01492859, + "balance_loss_clip": 1.22893214, + "balance_loss_mlp": 1.11730218, + "epoch": 0.10040281368364096, + "flos": 25158416754240.0, + "grad_norm": 6.840156706595795, + "language_loss": 0.83353192, + "learning_rate": 3.948308950159815e-06, + "loss": 0.86445802, + "num_input_tokens_seen": 17830445, + "router_z_loss_clip": 3.70898438, + "router_z_loss_mlp": 3.75195312, + "step": 835, + "time_per_iteration": 2.9461164474487305 + }, + { + "auxiliary_loss_clip": 0.0159565, + "auxiliary_loss_mlp": 0.01493605, + "balance_loss_clip": 1.22414136, + "balance_loss_mlp": 1.11862051, + "epoch": 0.10052305657428004, + "flos": 17605251701280.0, + "grad_norm": 2.69583458219012, + "language_loss": 0.76555967, + "learning_rate": 3.9481328460556326e-06, + "loss": 0.79645216, + "num_input_tokens_seen": 17847665, + "router_z_loss_clip": 3.71484375, + "router_z_loss_mlp": 3.74804688, + "step": 836, + "time_per_iteration": 2.933607339859009 + }, + { + "auxiliary_loss_clip": 0.01601507, + "auxiliary_loss_mlp": 0.01482579, + "balance_loss_clip": 1.22961307, + "balance_loss_mlp": 1.11083651, + "epoch": 0.10064329946491914, + "flos": 18662092516800.0, + "grad_norm": 2.490394198594864, + "language_loss": 0.89620131, + "learning_rate": 3.9479564464212455e-06, + "loss": 0.92704219, + "num_input_tokens_seen": 17866825, + "router_z_loss_clip": 3.71679688, + "router_z_loss_mlp": 3.71484375, + "step": 837, + "time_per_iteration": 2.979444980621338 + }, + { + "auxiliary_loss_clip": 0.01598706, + "auxiliary_loss_mlp": 0.01476238, + "balance_loss_clip": 1.2284404, + "balance_loss_mlp": 1.10335183, + "epoch": 0.10076354235555823, + "flos": 17200819723680.0, + "grad_norm": 2.699737263928195, + "language_loss": 0.76480681, + "learning_rate": 3.947779751283414e-06, + "loss": 0.79555625, + "num_input_tokens_seen": 17883995, + "router_z_loss_clip": 3.70507812, + "router_z_loss_mlp": 3.72265625, + "step": 838, + "time_per_iteration": 2.960392951965332 + }, + { + "auxiliary_loss_clip": 0.01605687, + "auxiliary_loss_mlp": 0.01496526, + "balance_loss_clip": 1.23434079, + "balance_loss_mlp": 1.12325788, + "epoch": 0.10088378524619732, + "flos": 22964250839040.0, + "grad_norm": 2.6478580609513434, + "language_loss": 0.76193494, + "learning_rate": 3.947602760668944e-06, + "loss": 0.79295707, + "num_input_tokens_seen": 17903785, + "router_z_loss_clip": 3.71289062, + "router_z_loss_mlp": 3.73242188, + "step": 839, + "time_per_iteration": 3.0007946491241455 + }, + { + "auxiliary_loss_clip": 0.01596194, + "auxiliary_loss_mlp": 0.01485376, + "balance_loss_clip": 1.22452235, + "balance_loss_mlp": 1.11268044, + "epoch": 0.10100402813683641, + "flos": 37888713596160.0, + "grad_norm": 5.0481812808611, + "language_loss": 0.71400338, + "learning_rate": 3.947425474604684e-06, + "loss": 0.74481905, + "num_input_tokens_seen": 17927720, + "router_z_loss_clip": 3.71875, + "router_z_loss_mlp": 3.72265625, + "step": 840, + "time_per_iteration": 3.961244583129883 + }, + { + "auxiliary_loss_clip": 0.01607346, + "auxiliary_loss_mlp": 0.01484851, + "balance_loss_clip": 1.2361263, + "balance_loss_mlp": 1.11749554, + "epoch": 0.1011242710274755, + "flos": 21545609662080.0, + "grad_norm": 2.017937701149057, + "language_loss": 0.92459577, + "learning_rate": 3.947247893117528e-06, + "loss": 0.95551777, + "num_input_tokens_seen": 17946225, + "router_z_loss_clip": 3.71289062, + "router_z_loss_mlp": 3.66992188, + "step": 841, + "time_per_iteration": 3.0848896503448486 + }, + { + "auxiliary_loss_clip": 0.01596972, + "auxiliary_loss_mlp": 0.01479844, + "balance_loss_clip": 1.22476029, + "balance_loss_mlp": 1.10199833, + "epoch": 0.10124451391811459, + "flos": 13622906903040.0, + "grad_norm": 4.630778919983364, + "language_loss": 0.69523299, + "learning_rate": 3.947070016234413e-06, + "loss": 0.72600108, + "num_input_tokens_seen": 17962015, + "router_z_loss_clip": 3.72265625, + "router_z_loss_mlp": 3.7734375, + "step": 842, + "time_per_iteration": 2.9715278148651123 + }, + { + "auxiliary_loss_clip": 0.01600438, + "auxiliary_loss_mlp": 0.01487238, + "balance_loss_clip": 1.2277205, + "balance_loss_mlp": 1.10862923, + "epoch": 0.10136475680875369, + "flos": 16650818010720.0, + "grad_norm": 3.3882094889939203, + "language_loss": 0.75021291, + "learning_rate": 3.946891843982326e-06, + "loss": 0.78108966, + "num_input_tokens_seen": 17979680, + "router_z_loss_clip": 3.72460938, + "router_z_loss_mlp": 3.78320312, + "step": 843, + "time_per_iteration": 3.862412929534912 + }, + { + "auxiliary_loss_clip": 0.01601958, + "auxiliary_loss_mlp": 0.01486929, + "balance_loss_clip": 1.22921586, + "balance_loss_mlp": 1.11003661, + "epoch": 0.10148499969939277, + "flos": 19462915702080.0, + "grad_norm": 2.070812730294009, + "language_loss": 0.74538243, + "learning_rate": 3.9467133763882935e-06, + "loss": 0.77627122, + "num_input_tokens_seen": 17998145, + "router_z_loss_clip": 3.7265625, + "router_z_loss_mlp": 3.765625, + "step": 844, + "time_per_iteration": 4.653211355209351 + }, + { + "auxiliary_loss_clip": 0.01597848, + "auxiliary_loss_mlp": 0.01467126, + "balance_loss_clip": 1.22460127, + "balance_loss_mlp": 1.08603764, + "epoch": 0.10160524259003187, + "flos": 21107079904320.0, + "grad_norm": 2.120787693908921, + "language_loss": 0.86397111, + "learning_rate": 3.9465346134793905e-06, + "loss": 0.8946209, + "num_input_tokens_seen": 18017955, + "router_z_loss_clip": 3.72851562, + "router_z_loss_mlp": 3.80664062, + "step": 845, + "time_per_iteration": 2.963433027267456 + }, + { + "auxiliary_loss_clip": 0.01605872, + "auxiliary_loss_mlp": 0.01483325, + "balance_loss_clip": 1.23361373, + "balance_loss_mlp": 1.10795856, + "epoch": 0.10172548548067095, + "flos": 17714637607680.0, + "grad_norm": 2.1471612045788797, + "language_loss": 0.79769671, + "learning_rate": 3.9463555552827335e-06, + "loss": 0.82858866, + "num_input_tokens_seen": 18035125, + "router_z_loss_clip": 3.72070312, + "router_z_loss_mlp": 3.75390625, + "step": 846, + "time_per_iteration": 2.977403402328491 + }, + { + "auxiliary_loss_clip": 0.01598629, + "auxiliary_loss_mlp": 0.01480625, + "balance_loss_clip": 1.22743082, + "balance_loss_mlp": 1.10277903, + "epoch": 0.10184572837131005, + "flos": 21106928191680.0, + "grad_norm": 2.4122398357264188, + "language_loss": 0.86046612, + "learning_rate": 3.946176201825487e-06, + "loss": 0.8912586, + "num_input_tokens_seen": 18053160, + "router_z_loss_clip": 3.70898438, + "router_z_loss_mlp": 3.77539062, + "step": 847, + "time_per_iteration": 2.9077513217926025 + }, + { + "auxiliary_loss_clip": 0.01603281, + "auxiliary_loss_mlp": 0.01482496, + "balance_loss_clip": 1.2320683, + "balance_loss_mlp": 1.10255229, + "epoch": 0.10196597126194913, + "flos": 26069991259680.0, + "grad_norm": 2.109268622282377, + "language_loss": 0.83574706, + "learning_rate": 3.9459965531348575e-06, + "loss": 0.8666048, + "num_input_tokens_seen": 18072815, + "router_z_loss_clip": 3.71875, + "router_z_loss_mlp": 3.796875, + "step": 848, + "time_per_iteration": 3.032599687576294 + }, + { + "auxiliary_loss_clip": 0.01605994, + "auxiliary_loss_mlp": 0.01483204, + "balance_loss_clip": 1.23579562, + "balance_loss_mlp": 1.10898256, + "epoch": 0.10208621415258823, + "flos": 29317205174400.0, + "grad_norm": 2.306641159579039, + "language_loss": 0.85890889, + "learning_rate": 3.945816609238098e-06, + "loss": 0.88980091, + "num_input_tokens_seen": 18092225, + "router_z_loss_clip": 3.703125, + "router_z_loss_mlp": 3.73828125, + "step": 849, + "time_per_iteration": 3.0393619537353516 + }, + { + "auxiliary_loss_clip": 0.01605139, + "auxiliary_loss_mlp": 0.01492932, + "balance_loss_clip": 1.23366475, + "balance_loss_mlp": 1.12195325, + "epoch": 0.10220645704322733, + "flos": 23808084922080.0, + "grad_norm": 1.9092626388457354, + "language_loss": 0.8512888, + "learning_rate": 3.945636370162507e-06, + "loss": 0.8822695, + "num_input_tokens_seen": 18112335, + "router_z_loss_clip": 3.71679688, + "router_z_loss_mlp": 3.70703125, + "step": 850, + "time_per_iteration": 3.0191712379455566 + }, + { + "auxiliary_loss_clip": 0.01598746, + "auxiliary_loss_mlp": 0.01489318, + "balance_loss_clip": 1.22781968, + "balance_loss_mlp": 1.11452389, + "epoch": 0.10232669993386641, + "flos": 23220609822720.0, + "grad_norm": 9.795328643211239, + "language_loss": 0.78902644, + "learning_rate": 3.945455835935425e-06, + "loss": 0.81990707, + "num_input_tokens_seen": 18131520, + "router_z_loss_clip": 3.70898438, + "router_z_loss_mlp": 3.74609375, + "step": 851, + "time_per_iteration": 3.022805690765381 + }, + { + "auxiliary_loss_clip": 0.01603143, + "auxiliary_loss_mlp": 0.01475163, + "balance_loss_clip": 1.23248196, + "balance_loss_mlp": 1.10570943, + "epoch": 0.1024469428245055, + "flos": 22924767260160.0, + "grad_norm": 2.2530776025296677, + "language_loss": 0.7539258, + "learning_rate": 3.94527500658424e-06, + "loss": 0.78470892, + "num_input_tokens_seen": 18149185, + "router_z_loss_clip": 3.70507812, + "router_z_loss_mlp": 3.69335938, + "step": 852, + "time_per_iteration": 2.9699203968048096 + }, + { + "auxiliary_loss_clip": 0.01610463, + "auxiliary_loss_mlp": 0.01481394, + "balance_loss_clip": 1.24121809, + "balance_loss_mlp": 1.10144973, + "epoch": 0.10256718571514459, + "flos": 31362274035360.0, + "grad_norm": 2.0709402887372828, + "language_loss": 0.81444073, + "learning_rate": 3.945093882136382e-06, + "loss": 0.84535921, + "num_input_tokens_seen": 18172960, + "router_z_loss_clip": 3.69335938, + "router_z_loss_mlp": 3.79492188, + "step": 853, + "time_per_iteration": 3.0619659423828125 + }, + { + "auxiliary_loss_clip": 0.01606058, + "auxiliary_loss_mlp": 0.01498852, + "balance_loss_clip": 1.23639619, + "balance_loss_mlp": 1.12577462, + "epoch": 0.10268742860578368, + "flos": 23477006734560.0, + "grad_norm": 1.8985223382549934, + "language_loss": 0.84813684, + "learning_rate": 3.944912462619329e-06, + "loss": 0.87918591, + "num_input_tokens_seen": 18191925, + "router_z_loss_clip": 3.69726562, + "router_z_loss_mlp": 3.72851562, + "step": 854, + "time_per_iteration": 2.9844300746917725 + }, + { + "auxiliary_loss_clip": 0.01603072, + "auxiliary_loss_mlp": 0.0150847, + "balance_loss_clip": 1.2319262, + "balance_loss_mlp": 1.13577366, + "epoch": 0.10280767149642277, + "flos": 25522682446080.0, + "grad_norm": 3.5885762665143965, + "language_loss": 0.80902296, + "learning_rate": 3.9447307480606025e-06, + "loss": 0.84013838, + "num_input_tokens_seen": 18212010, + "router_z_loss_clip": 3.7109375, + "router_z_loss_mlp": 3.72265625, + "step": 855, + "time_per_iteration": 2.99114990234375 + }, + { + "auxiliary_loss_clip": 0.01612496, + "auxiliary_loss_mlp": 0.01506375, + "balance_loss_clip": 1.24215913, + "balance_loss_mlp": 1.1361587, + "epoch": 0.10292791438706186, + "flos": 17349613352640.0, + "grad_norm": 2.491560845050699, + "language_loss": 0.90448159, + "learning_rate": 3.944548738487767e-06, + "loss": 0.93567026, + "num_input_tokens_seen": 18229525, + "router_z_loss_clip": 3.703125, + "router_z_loss_mlp": 3.69921875, + "step": 856, + "time_per_iteration": 3.0150704383850098 + }, + { + "auxiliary_loss_clip": 0.01603127, + "auxiliary_loss_mlp": 0.01483992, + "balance_loss_clip": 1.23150134, + "balance_loss_mlp": 1.11110497, + "epoch": 0.10304815727770096, + "flos": 27055147124160.0, + "grad_norm": 2.416269092463185, + "language_loss": 0.90764105, + "learning_rate": 3.944366433928434e-06, + "loss": 0.93851221, + "num_input_tokens_seen": 18249505, + "router_z_loss_clip": 3.71679688, + "router_z_loss_mlp": 3.72851562, + "step": 857, + "time_per_iteration": 3.031214475631714 + }, + { + "auxiliary_loss_clip": 0.01614228, + "auxiliary_loss_mlp": 0.01491859, + "balance_loss_clip": 1.24436152, + "balance_loss_mlp": 1.12412214, + "epoch": 0.10316840016834004, + "flos": 22784735036160.0, + "grad_norm": 1.7994060548318207, + "language_loss": 0.83663058, + "learning_rate": 3.9441838344102594e-06, + "loss": 0.86769146, + "num_input_tokens_seen": 18269230, + "router_z_loss_clip": 3.69921875, + "router_z_loss_mlp": 3.67382812, + "step": 858, + "time_per_iteration": 3.100882053375244 + }, + { + "auxiliary_loss_clip": 0.01622359, + "auxiliary_loss_mlp": 0.0149509, + "balance_loss_clip": 1.25294745, + "balance_loss_mlp": 1.12773514, + "epoch": 0.10328864305897914, + "flos": 20706706239840.0, + "grad_norm": 2.603630178537138, + "language_loss": 0.67290998, + "learning_rate": 3.944000939960943e-06, + "loss": 0.7040844, + "num_input_tokens_seen": 18287955, + "router_z_loss_clip": 3.69335938, + "router_z_loss_mlp": 3.66992188, + "step": 859, + "time_per_iteration": 3.0352165699005127 + }, + { + "auxiliary_loss_clip": 0.01615977, + "auxiliary_loss_mlp": 0.01491134, + "balance_loss_clip": 1.24816978, + "balance_loss_mlp": 1.11939204, + "epoch": 0.10340888594961822, + "flos": 28481525645760.0, + "grad_norm": 1.6046501008770648, + "language_loss": 0.80115306, + "learning_rate": 3.943817750608229e-06, + "loss": 0.83222413, + "num_input_tokens_seen": 18310505, + "router_z_loss_clip": 3.6796875, + "router_z_loss_mlp": 3.71484375, + "step": 860, + "time_per_iteration": 3.29726243019104 + }, + { + "auxiliary_loss_clip": 0.01616766, + "auxiliary_loss_mlp": 0.0148441, + "balance_loss_clip": 1.24861264, + "balance_loss_mlp": 1.10713661, + "epoch": 0.10352912884025732, + "flos": 13372009574400.0, + "grad_norm": 3.3261384679155808, + "language_loss": 0.82070911, + "learning_rate": 3.943634266379908e-06, + "loss": 0.85172087, + "num_input_tokens_seen": 18327400, + "router_z_loss_clip": 3.68164062, + "router_z_loss_mlp": 3.77148438, + "step": 861, + "time_per_iteration": 3.019839286804199 + }, + { + "auxiliary_loss_clip": 0.01612412, + "auxiliary_loss_mlp": 0.01480202, + "balance_loss_clip": 1.24312115, + "balance_loss_mlp": 1.10674345, + "epoch": 0.10364937173089642, + "flos": 25561028180160.0, + "grad_norm": 1.8723480755262756, + "language_loss": 0.85064852, + "learning_rate": 3.943450487303815e-06, + "loss": 0.88157469, + "num_input_tokens_seen": 18347895, + "router_z_loss_clip": 3.6953125, + "router_z_loss_mlp": 3.73046875, + "step": 862, + "time_per_iteration": 2.969719886779785 + }, + { + "auxiliary_loss_clip": 0.01614802, + "auxiliary_loss_mlp": 0.01483863, + "balance_loss_clip": 1.24517512, + "balance_loss_mlp": 1.11650753, + "epoch": 0.1037696146215355, + "flos": 21217641583680.0, + "grad_norm": 1.9466427678744214, + "language_loss": 0.85160995, + "learning_rate": 3.943266413407827e-06, + "loss": 0.88259661, + "num_input_tokens_seen": 18367170, + "router_z_loss_clip": 3.69726562, + "router_z_loss_mlp": 3.66992188, + "step": 863, + "time_per_iteration": 3.0416274070739746 + }, + { + "auxiliary_loss_clip": 0.01619883, + "auxiliary_loss_mlp": 0.01491229, + "balance_loss_clip": 1.25124145, + "balance_loss_mlp": 1.11662614, + "epoch": 0.1038898575121746, + "flos": 25809498106560.0, + "grad_norm": 1.8280169961153196, + "language_loss": 0.85178888, + "learning_rate": 3.94308204471987e-06, + "loss": 0.8829, + "num_input_tokens_seen": 18386185, + "router_z_loss_clip": 3.6875, + "router_z_loss_mlp": 3.7421875, + "step": 864, + "time_per_iteration": 2.993995428085327 + }, + { + "auxiliary_loss_clip": 0.01617983, + "auxiliary_loss_mlp": 0.01498957, + "balance_loss_clip": 1.25026393, + "balance_loss_mlp": 1.12721491, + "epoch": 0.10401010040281368, + "flos": 19064438445600.0, + "grad_norm": 2.7113537048345093, + "language_loss": 0.74856645, + "learning_rate": 3.942897381267912e-06, + "loss": 0.77973586, + "num_input_tokens_seen": 18402550, + "router_z_loss_clip": 3.67773438, + "router_z_loss_mlp": 3.71484375, + "step": 865, + "time_per_iteration": 2.9791650772094727 + }, + { + "auxiliary_loss_clip": 0.0162029, + "auxiliary_loss_mlp": 0.01483092, + "balance_loss_clip": 1.25178325, + "balance_loss_mlp": 1.11649942, + "epoch": 0.10413034329345278, + "flos": 16356568430880.0, + "grad_norm": 2.2517770988152948, + "language_loss": 0.66295207, + "learning_rate": 3.942712423079965e-06, + "loss": 0.69398588, + "num_input_tokens_seen": 18418940, + "router_z_loss_clip": 3.68554688, + "router_z_loss_mlp": 3.6640625, + "step": 866, + "time_per_iteration": 2.9190874099731445 + }, + { + "auxiliary_loss_clip": 0.0160793, + "auxiliary_loss_mlp": 0.01478821, + "balance_loss_clip": 1.2386775, + "balance_loss_mlp": 1.10650682, + "epoch": 0.10425058618409186, + "flos": 17238331038240.0, + "grad_norm": 2.609565888134661, + "language_loss": 0.89694875, + "learning_rate": 3.942527170184088e-06, + "loss": 0.92781627, + "num_input_tokens_seen": 18435560, + "router_z_loss_clip": 3.69335938, + "router_z_loss_mlp": 3.71875, + "step": 867, + "time_per_iteration": 2.899763584136963 + }, + { + "auxiliary_loss_clip": 0.01618227, + "auxiliary_loss_mlp": 0.01482696, + "balance_loss_clip": 1.24991083, + "balance_loss_mlp": 1.11038172, + "epoch": 0.10437082907473096, + "flos": 17969289824160.0, + "grad_norm": 2.7918188633663172, + "language_loss": 0.77701432, + "learning_rate": 3.942341622608385e-06, + "loss": 0.80802351, + "num_input_tokens_seen": 18452590, + "router_z_loss_clip": 3.68359375, + "router_z_loss_mlp": 3.71875, + "step": 868, + "time_per_iteration": 3.7807366847991943 + }, + { + "auxiliary_loss_clip": 0.0162663, + "auxiliary_loss_mlp": 0.01485317, + "balance_loss_clip": 1.25891829, + "balance_loss_mlp": 1.10956979, + "epoch": 0.10449107196537005, + "flos": 36286270734240.0, + "grad_norm": 2.6695759715084346, + "language_loss": 0.77854818, + "learning_rate": 3.942155780381001e-06, + "loss": 0.80966771, + "num_input_tokens_seen": 18476325, + "router_z_loss_clip": 3.67773438, + "router_z_loss_mlp": 3.75390625, + "step": 869, + "time_per_iteration": 3.1614861488342285 + }, + { + "auxiliary_loss_clip": 0.01626807, + "auxiliary_loss_mlp": 0.01489872, + "balance_loss_clip": 1.25928617, + "balance_loss_mlp": 1.1171757, + "epoch": 0.10461131485600914, + "flos": 23804178321600.0, + "grad_norm": 2.002009700056065, + "language_loss": 0.75819141, + "learning_rate": 3.94196964353013e-06, + "loss": 0.78935826, + "num_input_tokens_seen": 18495775, + "router_z_loss_clip": 3.67382812, + "router_z_loss_mlp": 3.7265625, + "step": 870, + "time_per_iteration": 3.0379810333251953 + }, + { + "auxiliary_loss_clip": 0.01625049, + "auxiliary_loss_mlp": 0.01495252, + "balance_loss_clip": 1.25974596, + "balance_loss_mlp": 1.13132977, + "epoch": 0.10473155774664823, + "flos": 18407440300320.0, + "grad_norm": 4.8279393172724525, + "language_loss": 0.80587095, + "learning_rate": 3.941783212084008e-06, + "loss": 0.83707398, + "num_input_tokens_seen": 18513530, + "router_z_loss_clip": 3.65429688, + "router_z_loss_mlp": 3.63671875, + "step": 871, + "time_per_iteration": 3.938800573348999 + }, + { + "auxiliary_loss_clip": 0.01619353, + "auxiliary_loss_mlp": 0.01489965, + "balance_loss_clip": 1.2527709, + "balance_loss_mlp": 1.12051129, + "epoch": 0.10485180063728732, + "flos": 25594974247680.0, + "grad_norm": 3.542604920835966, + "language_loss": 0.78941381, + "learning_rate": 3.941596486070916e-06, + "loss": 0.82050699, + "num_input_tokens_seen": 18531575, + "router_z_loss_clip": 3.6640625, + "router_z_loss_mlp": 3.69335938, + "step": 872, + "time_per_iteration": 3.9509525299072266 + }, + { + "auxiliary_loss_clip": 0.01619576, + "auxiliary_loss_mlp": 0.01483497, + "balance_loss_clip": 1.25205755, + "balance_loss_mlp": 1.1125176, + "epoch": 0.10497204352792641, + "flos": 27091179240480.0, + "grad_norm": 3.374278115178305, + "language_loss": 0.58955514, + "learning_rate": 3.941409465519182e-06, + "loss": 0.6205858, + "num_input_tokens_seen": 18552100, + "router_z_loss_clip": 3.67773438, + "router_z_loss_mlp": 3.70703125, + "step": 873, + "time_per_iteration": 3.0163092613220215 + }, + { + "auxiliary_loss_clip": 0.01611287, + "auxiliary_loss_mlp": 0.01501128, + "balance_loss_clip": 1.24392831, + "balance_loss_mlp": 1.13625216, + "epoch": 0.10509228641856551, + "flos": 32861513280960.0, + "grad_norm": 1.7505844903928316, + "language_loss": 0.85591888, + "learning_rate": 3.941222150457176e-06, + "loss": 0.887043, + "num_input_tokens_seen": 18575355, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 3.6484375, + "step": 874, + "time_per_iteration": 3.151426076889038 + }, + { + "auxiliary_loss_clip": 0.01619245, + "auxiliary_loss_mlp": 0.01467705, + "balance_loss_clip": 1.25302672, + "balance_loss_mlp": 1.09024036, + "epoch": 0.10521252930920459, + "flos": 14320905753600.0, + "grad_norm": 3.1886425735054527, + "language_loss": 0.71596122, + "learning_rate": 3.941034540913311e-06, + "loss": 0.74683082, + "num_input_tokens_seen": 18592885, + "router_z_loss_clip": 3.6640625, + "router_z_loss_mlp": 3.77148438, + "step": 875, + "time_per_iteration": 3.014413833618164 + }, + { + "auxiliary_loss_clip": 0.01615988, + "auxiliary_loss_mlp": 0.01500637, + "balance_loss_clip": 1.24958277, + "balance_loss_mlp": 1.12679613, + "epoch": 0.10533277219984369, + "flos": 21689055420480.0, + "grad_norm": 1.7143586604558805, + "language_loss": 0.82773, + "learning_rate": 3.940846636916051e-06, + "loss": 0.85889626, + "num_input_tokens_seen": 18612920, + "router_z_loss_clip": 3.66210938, + "router_z_loss_mlp": 3.73632812, + "step": 876, + "time_per_iteration": 2.913494110107422 + }, + { + "auxiliary_loss_clip": 0.01619782, + "auxiliary_loss_mlp": 0.01471986, + "balance_loss_clip": 1.25154757, + "balance_loss_mlp": 1.09661961, + "epoch": 0.10545301509048277, + "flos": 22271789499840.0, + "grad_norm": 2.1331674397123095, + "language_loss": 0.86756146, + "learning_rate": 3.940658438493899e-06, + "loss": 0.8984791, + "num_input_tokens_seen": 18630765, + "router_z_loss_clip": 3.6875, + "router_z_loss_mlp": 3.75195312, + "step": 877, + "time_per_iteration": 3.0028889179229736 + }, + { + "auxiliary_loss_clip": 0.01607766, + "auxiliary_loss_mlp": 0.01491536, + "balance_loss_clip": 1.23909616, + "balance_loss_mlp": 1.11273694, + "epoch": 0.10557325798112187, + "flos": 22201887172320.0, + "grad_norm": 3.2985176328111323, + "language_loss": 0.76334918, + "learning_rate": 3.940469945675405e-06, + "loss": 0.79434222, + "num_input_tokens_seen": 18649150, + "router_z_loss_clip": 3.6875, + "router_z_loss_mlp": 3.78320312, + "step": 878, + "time_per_iteration": 2.984618902206421 + }, + { + "auxiliary_loss_clip": 0.01621859, + "auxiliary_loss_mlp": 0.01491915, + "balance_loss_clip": 1.25507557, + "balance_loss_mlp": 1.11445105, + "epoch": 0.10569350087176095, + "flos": 25778282866560.0, + "grad_norm": 1.9550074451759856, + "language_loss": 0.91294014, + "learning_rate": 3.940281158489163e-06, + "loss": 0.94407785, + "num_input_tokens_seen": 18668380, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 3.77148438, + "step": 879, + "time_per_iteration": 3.0868396759033203 + }, + { + "auxiliary_loss_clip": 0.01616259, + "auxiliary_loss_mlp": 0.01478851, + "balance_loss_clip": 1.24717295, + "balance_loss_mlp": 1.09528303, + "epoch": 0.10581374376240005, + "flos": 17313315739200.0, + "grad_norm": 3.1852566512356, + "language_loss": 0.82863057, + "learning_rate": 3.940092076963812e-06, + "loss": 0.85958165, + "num_input_tokens_seen": 18685875, + "router_z_loss_clip": 3.68554688, + "router_z_loss_mlp": 3.83203125, + "step": 880, + "time_per_iteration": 3.1671738624572754 + }, + { + "auxiliary_loss_clip": 0.01622601, + "auxiliary_loss_mlp": 0.01497289, + "balance_loss_clip": 1.25502706, + "balance_loss_mlp": 1.12650061, + "epoch": 0.10593398665303914, + "flos": 34352294546880.0, + "grad_norm": 2.2651566602795308, + "language_loss": 0.7895937, + "learning_rate": 3.9399027011280355e-06, + "loss": 0.82079262, + "num_input_tokens_seen": 18707970, + "router_z_loss_clip": 3.6796875, + "router_z_loss_mlp": 3.70507812, + "step": 881, + "time_per_iteration": 3.0349769592285156 + }, + { + "auxiliary_loss_clip": 0.01611791, + "auxiliary_loss_mlp": 0.01479347, + "balance_loss_clip": 1.2415601, + "balance_loss_mlp": 1.09635186, + "epoch": 0.10605422954367823, + "flos": 23260207186080.0, + "grad_norm": 2.994567786420054, + "language_loss": 0.7727133, + "learning_rate": 3.939713031010561e-06, + "loss": 0.80362469, + "num_input_tokens_seen": 18726335, + "router_z_loss_clip": 3.703125, + "router_z_loss_mlp": 3.82617188, + "step": 882, + "time_per_iteration": 2.942671775817871 + }, + { + "auxiliary_loss_clip": 0.01611803, + "auxiliary_loss_mlp": 0.01482397, + "balance_loss_clip": 1.24291861, + "balance_loss_mlp": 1.10188103, + "epoch": 0.10617447243431732, + "flos": 22822322207040.0, + "grad_norm": 2.0892273058297923, + "language_loss": 0.78002119, + "learning_rate": 3.939523066640163e-06, + "loss": 0.81096327, + "num_input_tokens_seen": 18745230, + "router_z_loss_clip": 3.68945312, + "router_z_loss_mlp": 3.80078125, + "step": 883, + "time_per_iteration": 2.933262348175049 + }, + { + "auxiliary_loss_clip": 0.01605318, + "auxiliary_loss_mlp": 0.01494741, + "balance_loss_clip": 1.23558164, + "balance_loss_mlp": 1.1121273, + "epoch": 0.10629471532495641, + "flos": 24388732952640.0, + "grad_norm": 1.826229535341272, + "language_loss": 0.81298786, + "learning_rate": 3.939332808045657e-06, + "loss": 0.84398848, + "num_input_tokens_seen": 18764880, + "router_z_loss_clip": 3.6953125, + "router_z_loss_mlp": 3.82421875, + "step": 884, + "time_per_iteration": 2.9349942207336426 + }, + { + "auxiliary_loss_clip": 0.01609194, + "auxiliary_loss_mlp": 0.01488318, + "balance_loss_clip": 1.23970866, + "balance_loss_mlp": 1.11543155, + "epoch": 0.1064149582155955, + "flos": 21107610898560.0, + "grad_norm": 2.3892054972083314, + "language_loss": 0.84388179, + "learning_rate": 3.939142255255906e-06, + "loss": 0.87485683, + "num_input_tokens_seen": 18785765, + "router_z_loss_clip": 3.69335938, + "router_z_loss_mlp": 3.72460938, + "step": 885, + "time_per_iteration": 2.953016996383667 + }, + { + "auxiliary_loss_clip": 0.01615719, + "auxiliary_loss_mlp": 0.01477622, + "balance_loss_clip": 1.24732828, + "balance_loss_mlp": 1.09214675, + "epoch": 0.1065352011062346, + "flos": 20704013340480.0, + "grad_norm": 2.468800351128067, + "language_loss": 0.8691498, + "learning_rate": 3.938951408299817e-06, + "loss": 0.9000833, + "num_input_tokens_seen": 18804605, + "router_z_loss_clip": 3.68164062, + "router_z_loss_mlp": 3.85546875, + "step": 886, + "time_per_iteration": 2.886561393737793 + }, + { + "auxiliary_loss_clip": 0.01674909, + "auxiliary_loss_mlp": 0.0144574, + "balance_loss_clip": 1.31482995, + "balance_loss_mlp": 1.09631348, + "epoch": 0.10665544399687368, + "flos": 62665948334880.0, + "grad_norm": 0.8248579851271072, + "language_loss": 0.54422009, + "learning_rate": 3.938760267206342e-06, + "loss": 0.57542658, + "num_input_tokens_seen": 18866425, + "router_z_loss_clip": 3.59375, + "router_z_loss_mlp": 3.5, + "step": 887, + "time_per_iteration": 3.345921516418457 + }, + { + "auxiliary_loss_clip": 0.01616676, + "auxiliary_loss_mlp": 0.01489683, + "balance_loss_clip": 1.24781966, + "balance_loss_mlp": 1.10878527, + "epoch": 0.10677568688751278, + "flos": 26142776127360.0, + "grad_norm": 2.7674207234592236, + "language_loss": 0.79439533, + "learning_rate": 3.938568832004475e-06, + "loss": 0.82545888, + "num_input_tokens_seen": 18885130, + "router_z_loss_clip": 3.68945312, + "router_z_loss_mlp": 3.80664062, + "step": 888, + "time_per_iteration": 2.9725100994110107 + }, + { + "auxiliary_loss_clip": 0.01599609, + "auxiliary_loss_mlp": 0.01481278, + "balance_loss_clip": 1.22988057, + "balance_loss_mlp": 1.10114324, + "epoch": 0.10689592977815186, + "flos": 12788175578400.0, + "grad_norm": 9.219778854342616, + "language_loss": 0.75635844, + "learning_rate": 3.938377102723257e-06, + "loss": 0.78716731, + "num_input_tokens_seen": 18902265, + "router_z_loss_clip": 3.69726562, + "router_z_loss_mlp": 3.79882812, + "step": 889, + "time_per_iteration": 2.8592207431793213 + }, + { + "auxiliary_loss_clip": 0.01617088, + "auxiliary_loss_mlp": 0.01494537, + "balance_loss_clip": 1.24937916, + "balance_loss_mlp": 1.11573768, + "epoch": 0.10701617266879096, + "flos": 22128836807520.0, + "grad_norm": 2.625113734686742, + "language_loss": 0.83629286, + "learning_rate": 3.938185079391774e-06, + "loss": 0.86740911, + "num_input_tokens_seen": 18919310, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 3.78710938, + "step": 890, + "time_per_iteration": 3.0007224082946777 + }, + { + "auxiliary_loss_clip": 0.0160191, + "auxiliary_loss_mlp": 0.01489401, + "balance_loss_clip": 1.2316587, + "balance_loss_mlp": 1.11327183, + "epoch": 0.10713641555943004, + "flos": 19747114319520.0, + "grad_norm": 4.239495318696148, + "language_loss": 1.05671048, + "learning_rate": 3.937992762039157e-06, + "loss": 1.0876236, + "num_input_tokens_seen": 18932635, + "router_z_loss_clip": 3.70507812, + "router_z_loss_mlp": 3.75976562, + "step": 891, + "time_per_iteration": 2.989598274230957 + }, + { + "auxiliary_loss_clip": 0.01606806, + "auxiliary_loss_mlp": 0.01479023, + "balance_loss_clip": 1.2383908, + "balance_loss_mlp": 1.10041404, + "epoch": 0.10725665845006914, + "flos": 23955361424640.0, + "grad_norm": 1.6530988302274163, + "language_loss": 0.80691326, + "learning_rate": 3.937800150694577e-06, + "loss": 0.83777153, + "num_input_tokens_seen": 18953810, + "router_z_loss_clip": 3.68164062, + "router_z_loss_mlp": 3.78320312, + "step": 892, + "time_per_iteration": 3.1435980796813965 + }, + { + "auxiliary_loss_clip": 0.01614301, + "auxiliary_loss_mlp": 0.01487188, + "balance_loss_clip": 1.24619687, + "balance_loss_mlp": 1.11544538, + "epoch": 0.10737690134070824, + "flos": 18553692742560.0, + "grad_norm": 3.052945451502732, + "language_loss": 0.76098549, + "learning_rate": 3.937607245387255e-06, + "loss": 0.79200041, + "num_input_tokens_seen": 18973175, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 3.71484375, + "step": 893, + "time_per_iteration": 2.9993629455566406 + }, + { + "auxiliary_loss_clip": 0.01612442, + "auxiliary_loss_mlp": 0.01478334, + "balance_loss_clip": 1.24288654, + "balance_loss_mlp": 1.10849953, + "epoch": 0.10749714423134732, + "flos": 22709636550720.0, + "grad_norm": 4.061747014477386, + "language_loss": 0.72271168, + "learning_rate": 3.937414046146455e-06, + "loss": 0.75361943, + "num_input_tokens_seen": 18991130, + "router_z_loss_clip": 3.6953125, + "router_z_loss_mlp": 3.69726562, + "step": 894, + "time_per_iteration": 2.952162742614746 + }, + { + "auxiliary_loss_clip": 0.01609784, + "auxiliary_loss_mlp": 0.01493856, + "balance_loss_clip": 1.23978639, + "balance_loss_mlp": 1.12840784, + "epoch": 0.10761738712198642, + "flos": 21108217749120.0, + "grad_norm": 2.0391030185879373, + "language_loss": 0.75880671, + "learning_rate": 3.9372205530014845e-06, + "loss": 0.78984308, + "num_input_tokens_seen": 19009610, + "router_z_loss_clip": 3.69921875, + "router_z_loss_mlp": 3.65429688, + "step": 895, + "time_per_iteration": 2.923506259918213 + }, + { + "auxiliary_loss_clip": 0.01601489, + "auxiliary_loss_mlp": 0.01495109, + "balance_loss_clip": 1.23273802, + "balance_loss_mlp": 1.12641895, + "epoch": 0.1077376300126255, + "flos": 23768866840320.0, + "grad_norm": 2.5515925894040166, + "language_loss": 0.71751559, + "learning_rate": 3.937026765981696e-06, + "loss": 0.74848157, + "num_input_tokens_seen": 19029680, + "router_z_loss_clip": 3.68554688, + "router_z_loss_mlp": 3.68359375, + "step": 896, + "time_per_iteration": 3.7722866535186768 + }, + { + "auxiliary_loss_clip": 0.01617334, + "auxiliary_loss_mlp": 0.01504786, + "balance_loss_clip": 1.248559, + "balance_loss_mlp": 1.13418818, + "epoch": 0.1078578729032646, + "flos": 20921495595840.0, + "grad_norm": 2.314062588145528, + "language_loss": 0.79667139, + "learning_rate": 3.936832685116488e-06, + "loss": 0.8278926, + "num_input_tokens_seen": 19047775, + "router_z_loss_clip": 3.68554688, + "router_z_loss_mlp": 3.703125, + "step": 897, + "time_per_iteration": 2.9093453884124756 + }, + { + "auxiliary_loss_clip": 0.01613783, + "auxiliary_loss_mlp": 0.01501192, + "balance_loss_clip": 1.24605322, + "balance_loss_mlp": 1.14070272, + "epoch": 0.10797811579390369, + "flos": 14831651456640.0, + "grad_norm": 3.0454071135714504, + "language_loss": 0.90203559, + "learning_rate": 3.936638310435301e-06, + "loss": 0.93318534, + "num_input_tokens_seen": 19065640, + "router_z_loss_clip": 3.6796875, + "router_z_loss_mlp": 3.60546875, + "step": 898, + "time_per_iteration": 2.953727960586548 + }, + { + "auxiliary_loss_clip": 0.01606889, + "auxiliary_loss_mlp": 0.01514658, + "balance_loss_clip": 1.23720276, + "balance_loss_mlp": 1.15264344, + "epoch": 0.10809835868454278, + "flos": 19539038247840.0, + "grad_norm": 2.2235969214656084, + "language_loss": 0.81563675, + "learning_rate": 3.936443641967623e-06, + "loss": 0.84685218, + "num_input_tokens_seen": 19084470, + "router_z_loss_clip": 3.6953125, + "router_z_loss_mlp": 3.61914062, + "step": 899, + "time_per_iteration": 4.657940149307251 + }, + { + "auxiliary_loss_clip": 0.01605462, + "auxiliary_loss_mlp": 0.01475514, + "balance_loss_clip": 1.23769665, + "balance_loss_mlp": 1.11178267, + "epoch": 0.10821860157518187, + "flos": 18444306836160.0, + "grad_norm": 2.283752681709528, + "language_loss": 0.82796544, + "learning_rate": 3.936248679742983e-06, + "loss": 0.8587752, + "num_input_tokens_seen": 19102965, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 3.63476562, + "step": 900, + "time_per_iteration": 3.9968972206115723 + }, + { + "auxiliary_loss_clip": 0.01664135, + "auxiliary_loss_mlp": 0.01518318, + "balance_loss_clip": 1.30390024, + "balance_loss_mlp": 1.21466827, + "epoch": 0.10833884446582095, + "flos": 49363954872480.0, + "grad_norm": 1.1169122826871192, + "language_loss": 0.70159757, + "learning_rate": 3.936053423790959e-06, + "loss": 0.7334221, + "num_input_tokens_seen": 19151285, + "router_z_loss_clip": 3.609375, + "router_z_loss_mlp": 3.03125, + "step": 901, + "time_per_iteration": 3.379337787628174 + }, + { + "auxiliary_loss_clip": 0.01605218, + "auxiliary_loss_mlp": 0.01493113, + "balance_loss_clip": 1.23791921, + "balance_loss_mlp": 1.12480354, + "epoch": 0.10845908735646005, + "flos": 20413973786400.0, + "grad_norm": 2.455179694442795, + "language_loss": 0.77148306, + "learning_rate": 3.935857874141168e-06, + "loss": 0.80246639, + "num_input_tokens_seen": 19170120, + "router_z_loss_clip": 3.67382812, + "router_z_loss_mlp": 3.67773438, + "step": 902, + "time_per_iteration": 2.9893758296966553 + }, + { + "auxiliary_loss_clip": 0.01603035, + "auxiliary_loss_mlp": 0.01488771, + "balance_loss_clip": 1.23336923, + "balance_loss_mlp": 1.11855507, + "epoch": 0.10857933024709913, + "flos": 14029500785760.0, + "grad_norm": 2.964215350777075, + "language_loss": 0.83774781, + "learning_rate": 3.935662030823279e-06, + "loss": 0.86866581, + "num_input_tokens_seen": 19186305, + "router_z_loss_clip": 3.69726562, + "router_z_loss_mlp": 3.69921875, + "step": 903, + "time_per_iteration": 2.918752908706665 + }, + { + "auxiliary_loss_clip": 0.01606, + "auxiliary_loss_mlp": 0.01485497, + "balance_loss_clip": 1.23800898, + "balance_loss_mlp": 1.10726976, + "epoch": 0.10869957313773823, + "flos": 13370606232480.0, + "grad_norm": 2.6153704883254187, + "language_loss": 0.72500205, + "learning_rate": 3.935465893866998e-06, + "loss": 0.75591707, + "num_input_tokens_seen": 19204530, + "router_z_loss_clip": 3.68164062, + "router_z_loss_mlp": 3.77929688, + "step": 904, + "time_per_iteration": 2.887540340423584 + }, + { + "auxiliary_loss_clip": 0.01608451, + "auxiliary_loss_mlp": 0.01489379, + "balance_loss_clip": 1.24108136, + "balance_loss_mlp": 1.11324954, + "epoch": 0.10881981602837733, + "flos": 25809194681280.0, + "grad_norm": 2.5844452202742594, + "language_loss": 0.80059958, + "learning_rate": 3.935269463302079e-06, + "loss": 0.83157784, + "num_input_tokens_seen": 19222735, + "router_z_loss_clip": 3.67382812, + "router_z_loss_mlp": 3.75976562, + "step": 905, + "time_per_iteration": 2.9192774295806885 + }, + { + "auxiliary_loss_clip": 0.01609645, + "auxiliary_loss_mlp": 0.01488003, + "balance_loss_clip": 1.24167895, + "balance_loss_mlp": 1.10634267, + "epoch": 0.10894005891901641, + "flos": 20779680748320.0, + "grad_norm": 2.675689245820799, + "language_loss": 0.76890403, + "learning_rate": 3.935072739158322e-06, + "loss": 0.7998805, + "num_input_tokens_seen": 19242445, + "router_z_loss_clip": 3.6796875, + "router_z_loss_mlp": 3.8125, + "step": 906, + "time_per_iteration": 2.8784780502319336 + }, + { + "auxiliary_loss_clip": 0.01602041, + "auxiliary_loss_mlp": 0.01479312, + "balance_loss_clip": 1.23362648, + "balance_loss_mlp": 1.10165715, + "epoch": 0.10906030180965551, + "flos": 26652232272960.0, + "grad_norm": 1.8528645374700077, + "language_loss": 0.79831052, + "learning_rate": 3.934875721465569e-06, + "loss": 0.82912397, + "num_input_tokens_seen": 19262865, + "router_z_loss_clip": 3.68164062, + "router_z_loss_mlp": 3.77539062, + "step": 907, + "time_per_iteration": 2.9558136463165283 + }, + { + "auxiliary_loss_clip": 0.01603824, + "auxiliary_loss_mlp": 0.01480107, + "balance_loss_clip": 1.23515081, + "balance_loss_mlp": 1.09711158, + "epoch": 0.10918054470029459, + "flos": 36537130134720.0, + "grad_norm": 2.7805179817087833, + "language_loss": 0.71839255, + "learning_rate": 3.9346784102537076e-06, + "loss": 0.74923182, + "num_input_tokens_seen": 19285000, + "router_z_loss_clip": 3.6875, + "router_z_loss_mlp": 3.82421875, + "step": 908, + "time_per_iteration": 3.0140891075134277 + }, + { + "auxiliary_loss_clip": 0.01600925, + "auxiliary_loss_mlp": 0.01477217, + "balance_loss_clip": 1.23468184, + "balance_loss_mlp": 1.07858086, + "epoch": 0.10930078759093369, + "flos": 21764722828320.0, + "grad_norm": 2.003369781830168, + "language_loss": 0.7852546, + "learning_rate": 3.934480805552669e-06, + "loss": 0.81603599, + "num_input_tokens_seen": 19306010, + "router_z_loss_clip": 3.66210938, + "router_z_loss_mlp": 3.98828125, + "step": 909, + "time_per_iteration": 2.971510171890259 + }, + { + "auxiliary_loss_clip": 0.01603952, + "auxiliary_loss_mlp": 0.01499947, + "balance_loss_clip": 1.23688245, + "balance_loss_mlp": 1.1112293, + "epoch": 0.10942103048157277, + "flos": 22604005532160.0, + "grad_norm": 2.65905478846876, + "language_loss": 0.88350773, + "learning_rate": 3.93428290739243e-06, + "loss": 0.91454673, + "num_input_tokens_seen": 19325380, + "router_z_loss_clip": 3.66992188, + "router_z_loss_mlp": 3.88476562, + "step": 910, + "time_per_iteration": 2.947561502456665 + }, + { + "auxiliary_loss_clip": 0.01597172, + "auxiliary_loss_mlp": 0.01479017, + "balance_loss_clip": 1.22948837, + "balance_loss_mlp": 1.08705676, + "epoch": 0.10954127337221187, + "flos": 15047389016640.0, + "grad_norm": 2.6442759104638416, + "language_loss": 0.80101818, + "learning_rate": 3.9340847158030125e-06, + "loss": 0.83178002, + "num_input_tokens_seen": 19338960, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 3.91796875, + "step": 911, + "time_per_iteration": 2.9363560676574707 + }, + { + "auxiliary_loss_clip": 0.01587398, + "auxiliary_loss_mlp": 0.01496693, + "balance_loss_clip": 1.22071767, + "balance_loss_mlp": 1.09309769, + "epoch": 0.10966151626285096, + "flos": 21653023304160.0, + "grad_norm": 1.9974668018209563, + "language_loss": 0.75806046, + "learning_rate": 3.9338862308144814e-06, + "loss": 0.78890133, + "num_input_tokens_seen": 19357780, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 4.03515625, + "step": 912, + "time_per_iteration": 2.9503231048583984 + }, + { + "auxiliary_loss_clip": 0.01589622, + "auxiliary_loss_mlp": 0.01485362, + "balance_loss_clip": 1.22358108, + "balance_loss_mlp": 1.09664392, + "epoch": 0.10978175915349005, + "flos": 20123479094400.0, + "grad_norm": 1.7031487414706818, + "language_loss": 0.84603703, + "learning_rate": 3.933687452456946e-06, + "loss": 0.87678683, + "num_input_tokens_seen": 19377680, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 3.88671875, + "step": 913, + "time_per_iteration": 3.042356491088867 + }, + { + "auxiliary_loss_clip": 0.01591184, + "auxiliary_loss_mlp": 0.01486486, + "balance_loss_clip": 1.22288561, + "balance_loss_mlp": 1.09643328, + "epoch": 0.10990200204412914, + "flos": 20414770277760.0, + "grad_norm": 3.521993485258445, + "language_loss": 0.86660904, + "learning_rate": 3.933488380760562e-06, + "loss": 0.89738578, + "num_input_tokens_seen": 19397040, + "router_z_loss_clip": 3.68164062, + "router_z_loss_mlp": 3.90039062, + "step": 914, + "time_per_iteration": 2.994342088699341 + }, + { + "auxiliary_loss_clip": 0.01594665, + "auxiliary_loss_mlp": 0.01484658, + "balance_loss_clip": 1.22837615, + "balance_loss_mlp": 1.09613132, + "epoch": 0.11002224493476823, + "flos": 17532049623840.0, + "grad_norm": 2.33996100186094, + "language_loss": 0.87558651, + "learning_rate": 3.9332890157555286e-06, + "loss": 0.9063797, + "num_input_tokens_seen": 19413975, + "router_z_loss_clip": 3.6640625, + "router_z_loss_mlp": 3.8828125, + "step": 915, + "time_per_iteration": 2.8739395141601562 + }, + { + "auxiliary_loss_clip": 0.01597236, + "auxiliary_loss_mlp": 0.01467199, + "balance_loss_clip": 1.23115206, + "balance_loss_mlp": 1.08305883, + "epoch": 0.11014248782540732, + "flos": 12204720864000.0, + "grad_norm": 2.803255711226584, + "language_loss": 0.76580894, + "learning_rate": 3.933089357472088e-06, + "loss": 0.7964533, + "num_input_tokens_seen": 19432005, + "router_z_loss_clip": 3.66015625, + "router_z_loss_mlp": 3.83984375, + "step": 916, + "time_per_iteration": 2.9287381172180176 + }, + { + "auxiliary_loss_clip": 0.01596121, + "auxiliary_loss_mlp": 0.01481765, + "balance_loss_clip": 1.22990656, + "balance_loss_mlp": 1.09495425, + "epoch": 0.11026273071604642, + "flos": 22385195791200.0, + "grad_norm": 1.9981970161620304, + "language_loss": 0.86234862, + "learning_rate": 3.932889405940529e-06, + "loss": 0.89312744, + "num_input_tokens_seen": 19450100, + "router_z_loss_clip": 3.66015625, + "router_z_loss_mlp": 3.86523438, + "step": 917, + "time_per_iteration": 3.1188406944274902 + }, + { + "auxiliary_loss_clip": 0.01602175, + "auxiliary_loss_mlp": 0.01497868, + "balance_loss_clip": 1.23705149, + "balance_loss_mlp": 1.12498152, + "epoch": 0.1103829736066855, + "flos": 19831125922560.0, + "grad_norm": 6.190110140166082, + "language_loss": 0.80104172, + "learning_rate": 3.932689161191184e-06, + "loss": 0.83204216, + "num_input_tokens_seen": 19467805, + "router_z_loss_clip": 3.65039062, + "router_z_loss_mlp": 3.72460938, + "step": 918, + "time_per_iteration": 2.8859646320343018 + }, + { + "auxiliary_loss_clip": 0.01590601, + "auxiliary_loss_mlp": 0.01484819, + "balance_loss_clip": 1.22516751, + "balance_loss_mlp": 1.10620999, + "epoch": 0.1105032164973246, + "flos": 22671556313760.0, + "grad_norm": 2.461264106271353, + "language_loss": 0.88289928, + "learning_rate": 3.93248862325443e-06, + "loss": 0.91365349, + "num_input_tokens_seen": 19486710, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 3.78320312, + "step": 919, + "time_per_iteration": 2.9542794227600098 + }, + { + "auxiliary_loss_clip": 0.01628909, + "auxiliary_loss_mlp": 0.01446732, + "balance_loss_clip": 1.27220654, + "balance_loss_mlp": 1.07365417, + "epoch": 0.11062345938796368, + "flos": 66489638182560.0, + "grad_norm": 0.9600435476609395, + "language_loss": 0.64409781, + "learning_rate": 3.932287792160688e-06, + "loss": 0.67485416, + "num_input_tokens_seen": 19545170, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 3.7265625, + "step": 920, + "time_per_iteration": 3.427727699279785 + }, + { + "auxiliary_loss_clip": 0.01593771, + "auxiliary_loss_mlp": 0.01496318, + "balance_loss_clip": 1.22813237, + "balance_loss_mlp": 1.1318233, + "epoch": 0.11074370227860278, + "flos": 21909913282080.0, + "grad_norm": 5.52945114961642, + "language_loss": 0.80958652, + "learning_rate": 3.932086667940424e-06, + "loss": 0.84048742, + "num_input_tokens_seen": 19561875, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 3.64257812, + "step": 921, + "time_per_iteration": 2.9414055347442627 + }, + { + "auxiliary_loss_clip": 0.01592735, + "auxiliary_loss_mlp": 0.01495529, + "balance_loss_clip": 1.22794056, + "balance_loss_mlp": 1.13179779, + "epoch": 0.11086394516924186, + "flos": 28660662167040.0, + "grad_norm": 4.709485477664381, + "language_loss": 0.8188591, + "learning_rate": 3.93188525062415e-06, + "loss": 0.8497417, + "num_input_tokens_seen": 19582340, + "router_z_loss_clip": 3.65039062, + "router_z_loss_mlp": 3.63671875, + "step": 922, + "time_per_iteration": 2.982247829437256 + }, + { + "auxiliary_loss_clip": 0.01591947, + "auxiliary_loss_mlp": 0.01521165, + "balance_loss_clip": 1.22772086, + "balance_loss_mlp": 1.16067648, + "epoch": 0.11098418805988096, + "flos": 24537564509760.0, + "grad_norm": 2.6057483099496688, + "language_loss": 0.8636837, + "learning_rate": 3.931683540242418e-06, + "loss": 0.89481485, + "num_input_tokens_seen": 19603405, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 3.60351562, + "step": 923, + "time_per_iteration": 2.9562530517578125 + }, + { + "auxiliary_loss_clip": 0.01597586, + "auxiliary_loss_mlp": 0.01503046, + "balance_loss_clip": 1.23303497, + "balance_loss_mlp": 1.13549995, + "epoch": 0.11110443095052006, + "flos": 22962278574720.0, + "grad_norm": 6.1265722288539886, + "language_loss": 0.90992677, + "learning_rate": 3.9314815368258295e-06, + "loss": 0.94093299, + "num_input_tokens_seen": 19619885, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 3.67578125, + "step": 924, + "time_per_iteration": 3.7770867347717285 + }, + { + "auxiliary_loss_clip": 0.01609375, + "auxiliary_loss_mlp": 0.01499335, + "balance_loss_clip": 1.24646997, + "balance_loss_mlp": 1.1396091, + "epoch": 0.11122467384115914, + "flos": 18951866573760.0, + "grad_norm": 2.0636386263697526, + "language_loss": 0.79139709, + "learning_rate": 3.9312792404050275e-06, + "loss": 0.82248425, + "num_input_tokens_seen": 19637940, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 3.59765625, + "step": 925, + "time_per_iteration": 2.9302220344543457 + }, + { + "auxiliary_loss_clip": 0.01595776, + "auxiliary_loss_mlp": 0.01510049, + "balance_loss_clip": 1.23153961, + "balance_loss_mlp": 1.15165854, + "epoch": 0.11134491673179824, + "flos": 25085328461280.0, + "grad_norm": 11.399444957656595, + "language_loss": 0.77410662, + "learning_rate": 3.9310766510107e-06, + "loss": 0.80516487, + "num_input_tokens_seen": 19657115, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 3.58398438, + "step": 926, + "time_per_iteration": 3.8751869201660156 + }, + { + "auxiliary_loss_clip": 0.01598106, + "auxiliary_loss_mlp": 0.01499216, + "balance_loss_clip": 1.23427892, + "balance_loss_mlp": 1.13643801, + "epoch": 0.11146515962243732, + "flos": 24501456537120.0, + "grad_norm": 1.9263885312795554, + "language_loss": 0.92492652, + "learning_rate": 3.9308737686735806e-06, + "loss": 0.95589972, + "num_input_tokens_seen": 19677075, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 3.62695312, + "step": 927, + "time_per_iteration": 3.91658616065979 + }, + { + "auxiliary_loss_clip": 0.01601145, + "auxiliary_loss_mlp": 0.0152356, + "balance_loss_clip": 1.23635447, + "balance_loss_mlp": 1.17508698, + "epoch": 0.11158540251307641, + "flos": 22345825996800.0, + "grad_norm": 4.364791139615734, + "language_loss": 0.82902074, + "learning_rate": 3.9306705934244455e-06, + "loss": 0.86026776, + "num_input_tokens_seen": 19697155, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 3.48828125, + "step": 928, + "time_per_iteration": 3.905747413635254 + }, + { + "auxiliary_loss_clip": 0.01602639, + "auxiliary_loss_mlp": 0.01492181, + "balance_loss_clip": 1.23974466, + "balance_loss_mlp": 1.13188243, + "epoch": 0.11170564540371551, + "flos": 19904403856320.0, + "grad_norm": 2.072124675967269, + "language_loss": 0.88343984, + "learning_rate": 3.930467125294116e-06, + "loss": 0.914388, + "num_input_tokens_seen": 19716705, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 3.60351562, + "step": 929, + "time_per_iteration": 3.1979260444641113 + }, + { + "auxiliary_loss_clip": 0.01650899, + "auxiliary_loss_mlp": 0.01475883, + "balance_loss_clip": 1.29497313, + "balance_loss_mlp": 1.15087128, + "epoch": 0.1118258882943546, + "flos": 64592263033920.0, + "grad_norm": 0.9505146780627541, + "language_loss": 0.60440308, + "learning_rate": 3.930263364313458e-06, + "loss": 0.63567084, + "num_input_tokens_seen": 19767275, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 3.25, + "step": 930, + "time_per_iteration": 3.2174909114837646 + }, + { + "auxiliary_loss_clip": 0.01616497, + "auxiliary_loss_mlp": 0.01515767, + "balance_loss_clip": 1.25367343, + "balance_loss_mlp": 1.15794897, + "epoch": 0.11194613118499369, + "flos": 17203891904640.0, + "grad_norm": 2.0857134368163424, + "language_loss": 0.83194625, + "learning_rate": 3.930059310513384e-06, + "loss": 0.86326891, + "num_input_tokens_seen": 19786315, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 3.57617188, + "step": 931, + "time_per_iteration": 3.006075143814087 + }, + { + "auxiliary_loss_clip": 0.0161265, + "auxiliary_loss_mlp": 0.01522681, + "balance_loss_clip": 1.24912, + "balance_loss_mlp": 1.16085732, + "epoch": 0.11206637407563277, + "flos": 31865889244320.0, + "grad_norm": 2.1880003443800113, + "language_loss": 0.83970243, + "learning_rate": 3.929854963924846e-06, + "loss": 0.87105578, + "num_input_tokens_seen": 19806580, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 3.62109375, + "step": 932, + "time_per_iteration": 3.0294599533081055 + }, + { + "auxiliary_loss_clip": 0.01610176, + "auxiliary_loss_mlp": 0.0151183, + "balance_loss_clip": 1.24579775, + "balance_loss_mlp": 1.15744472, + "epoch": 0.11218661696627187, + "flos": 21947841806400.0, + "grad_norm": 1.9687824527706528, + "language_loss": 0.77364635, + "learning_rate": 3.929650324578845e-06, + "loss": 0.80486643, + "num_input_tokens_seen": 19826045, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 3.546875, + "step": 933, + "time_per_iteration": 3.104999303817749 + }, + { + "auxiliary_loss_clip": 0.01614727, + "auxiliary_loss_mlp": 0.01481881, + "balance_loss_clip": 1.24976265, + "balance_loss_mlp": 1.1099478, + "epoch": 0.11230685985691095, + "flos": 25880196925440.0, + "grad_norm": 2.9045433898283326, + "language_loss": 0.82013649, + "learning_rate": 3.929445392506423e-06, + "loss": 0.85110259, + "num_input_tokens_seen": 19843985, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 3.71289062, + "step": 934, + "time_per_iteration": 3.0169870853424072 + }, + { + "auxiliary_loss_clip": 0.01615107, + "auxiliary_loss_mlp": 0.01493542, + "balance_loss_clip": 1.25053835, + "balance_loss_mlp": 1.12275362, + "epoch": 0.11242710274755005, + "flos": 22233405837600.0, + "grad_norm": 2.6424352768522583, + "language_loss": 0.75782865, + "learning_rate": 3.92924016773867e-06, + "loss": 0.78891516, + "num_input_tokens_seen": 19860480, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 3.70507812, + "step": 935, + "time_per_iteration": 2.908818244934082 + }, + { + "auxiliary_loss_clip": 0.01609998, + "auxiliary_loss_mlp": 0.01487508, + "balance_loss_clip": 1.24463117, + "balance_loss_mlp": 1.11824572, + "epoch": 0.11254734563818915, + "flos": 17714106613440.0, + "grad_norm": 3.2173024546326268, + "language_loss": 0.73290563, + "learning_rate": 3.9290346503067175e-06, + "loss": 0.76388073, + "num_input_tokens_seen": 19877145, + "router_z_loss_clip": 3.65429688, + "router_z_loss_mlp": 3.6875, + "step": 936, + "time_per_iteration": 2.991431951522827 + }, + { + "auxiliary_loss_clip": 0.01608638, + "auxiliary_loss_mlp": 0.01488237, + "balance_loss_clip": 1.24432445, + "balance_loss_mlp": 1.10562277, + "epoch": 0.11266758852882823, + "flos": 54934557400800.0, + "grad_norm": 2.6285953119585272, + "language_loss": 0.79001325, + "learning_rate": 3.9288288402417415e-06, + "loss": 0.82098204, + "num_input_tokens_seen": 19903405, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 3.82421875, + "step": 937, + "time_per_iteration": 3.2893359661102295 + }, + { + "auxiliary_loss_clip": 0.01609555, + "auxiliary_loss_mlp": 0.01486977, + "balance_loss_clip": 1.24395192, + "balance_loss_mlp": 1.11161041, + "epoch": 0.11278783141946733, + "flos": 18880029910080.0, + "grad_norm": 6.251093627530794, + "language_loss": 0.70454371, + "learning_rate": 3.928622737574964e-06, + "loss": 0.73550904, + "num_input_tokens_seen": 19918740, + "router_z_loss_clip": 3.65429688, + "router_z_loss_mlp": 3.75, + "step": 938, + "time_per_iteration": 3.071906089782715 + }, + { + "auxiliary_loss_clip": 0.01604014, + "auxiliary_loss_mlp": 0.01474621, + "balance_loss_clip": 1.23852372, + "balance_loss_mlp": 1.09200668, + "epoch": 0.11290807431010641, + "flos": 26471692409760.0, + "grad_norm": 1.858757943141767, + "language_loss": 0.9048655, + "learning_rate": 3.928416342337652e-06, + "loss": 0.93565184, + "num_input_tokens_seen": 19938475, + "router_z_loss_clip": 3.65039062, + "router_z_loss_mlp": 3.82421875, + "step": 939, + "time_per_iteration": 3.0644662380218506 + }, + { + "auxiliary_loss_clip": 0.01609661, + "auxiliary_loss_mlp": 0.01477686, + "balance_loss_clip": 1.24464774, + "balance_loss_mlp": 1.09488106, + "epoch": 0.1130283172007455, + "flos": 22712784588000.0, + "grad_norm": 2.7390450806246736, + "language_loss": 0.82785833, + "learning_rate": 3.928209654561113e-06, + "loss": 0.85873187, + "num_input_tokens_seen": 19959310, + "router_z_loss_clip": 3.65039062, + "router_z_loss_mlp": 3.82617188, + "step": 940, + "time_per_iteration": 2.93701434135437 + }, + { + "auxiliary_loss_clip": 0.0161316, + "auxiliary_loss_mlp": 0.0147059, + "balance_loss_clip": 1.2486943, + "balance_loss_mlp": 1.078439, + "epoch": 0.1131485600913846, + "flos": 23222051092800.0, + "grad_norm": 2.9683499487140126, + "language_loss": 0.81498003, + "learning_rate": 3.928002674276703e-06, + "loss": 0.84581751, + "num_input_tokens_seen": 19978700, + "router_z_loss_clip": 3.65039062, + "router_z_loss_mlp": 3.91992188, + "step": 941, + "time_per_iteration": 2.9591360092163086 + }, + { + "auxiliary_loss_clip": 0.01610079, + "auxiliary_loss_mlp": 0.01493285, + "balance_loss_clip": 1.2449882, + "balance_loss_mlp": 1.10494876, + "epoch": 0.11326880298202369, + "flos": 14066025968160.0, + "grad_norm": 3.702826995804878, + "language_loss": 0.75685877, + "learning_rate": 3.92779540151582e-06, + "loss": 0.7878924, + "num_input_tokens_seen": 19995785, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 3.8828125, + "step": 942, + "time_per_iteration": 2.9017679691314697 + }, + { + "auxiliary_loss_clip": 0.01613313, + "auxiliary_loss_mlp": 0.01499869, + "balance_loss_clip": 1.24901998, + "balance_loss_mlp": 1.11668253, + "epoch": 0.11338904587266278, + "flos": 16327211670720.0, + "grad_norm": 2.3040370061626345, + "language_loss": 0.85966259, + "learning_rate": 3.927587836309907e-06, + "loss": 0.8907944, + "num_input_tokens_seen": 20013615, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 3.82421875, + "step": 943, + "time_per_iteration": 2.987048387527466 + }, + { + "auxiliary_loss_clip": 0.01604496, + "auxiliary_loss_mlp": 0.01483542, + "balance_loss_clip": 1.23854661, + "balance_loss_mlp": 1.09902, + "epoch": 0.11350928876330187, + "flos": 24428595813120.0, + "grad_norm": 3.1510623580113313, + "language_loss": 0.78490174, + "learning_rate": 3.927379978690452e-06, + "loss": 0.81578213, + "num_input_tokens_seen": 20032880, + "router_z_loss_clip": 3.66015625, + "router_z_loss_mlp": 3.84570312, + "step": 944, + "time_per_iteration": 3.0268664360046387 + }, + { + "auxiliary_loss_clip": 0.01607551, + "auxiliary_loss_mlp": 0.01498084, + "balance_loss_clip": 1.24209559, + "balance_loss_mlp": 1.12119222, + "epoch": 0.11362953165394096, + "flos": 24499294632000.0, + "grad_norm": 2.321179533686979, + "language_loss": 0.87852716, + "learning_rate": 3.927171828688987e-06, + "loss": 0.90958345, + "num_input_tokens_seen": 20052405, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 3.765625, + "step": 945, + "time_per_iteration": 2.9040369987487793 + }, + { + "auxiliary_loss_clip": 0.01613538, + "auxiliary_loss_mlp": 0.01484858, + "balance_loss_clip": 1.24827051, + "balance_loss_mlp": 1.10682154, + "epoch": 0.11374977454458005, + "flos": 24063116420160.0, + "grad_norm": 2.5135299052584847, + "language_loss": 0.82072675, + "learning_rate": 3.926963386337088e-06, + "loss": 0.85171074, + "num_input_tokens_seen": 20070635, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 3.77734375, + "step": 946, + "time_per_iteration": 3.0045812129974365 + }, + { + "auxiliary_loss_clip": 0.0160886, + "auxiliary_loss_mlp": 0.01494081, + "balance_loss_clip": 1.24258947, + "balance_loss_mlp": 1.11699772, + "epoch": 0.11387001743521914, + "flos": 39460775637600.0, + "grad_norm": 10.287676536045854, + "language_loss": 0.70056355, + "learning_rate": 3.926754651666375e-06, + "loss": 0.73159301, + "num_input_tokens_seen": 20091195, + "router_z_loss_clip": 3.66015625, + "router_z_loss_mlp": 3.76953125, + "step": 947, + "time_per_iteration": 3.0676300525665283 + }, + { + "auxiliary_loss_clip": 0.01605003, + "auxiliary_loss_mlp": 0.01479749, + "balance_loss_clip": 1.23944938, + "balance_loss_mlp": 1.09484637, + "epoch": 0.11399026032585824, + "flos": 25084797467040.0, + "grad_norm": 2.8826594953033715, + "language_loss": 0.78070605, + "learning_rate": 3.926545624708513e-06, + "loss": 0.8115536, + "num_input_tokens_seen": 20110435, + "router_z_loss_clip": 3.65039062, + "router_z_loss_mlp": 3.84765625, + "step": 948, + "time_per_iteration": 2.9807968139648438 + }, + { + "auxiliary_loss_clip": 0.01607813, + "auxiliary_loss_mlp": 0.01493709, + "balance_loss_clip": 1.24245024, + "balance_loss_mlp": 1.11815166, + "epoch": 0.11411050321649732, + "flos": 17963448887520.0, + "grad_norm": 4.296422717469386, + "language_loss": 0.85589409, + "learning_rate": 3.926336305495213e-06, + "loss": 0.88690925, + "num_input_tokens_seen": 20128995, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 3.75390625, + "step": 949, + "time_per_iteration": 3.0824358463287354 + }, + { + "auxiliary_loss_clip": 0.01610217, + "auxiliary_loss_mlp": 0.01472826, + "balance_loss_clip": 1.24498129, + "balance_loss_mlp": 1.09612465, + "epoch": 0.11423074610713642, + "flos": 22457601377280.0, + "grad_norm": 2.5735906299559828, + "language_loss": 0.89202261, + "learning_rate": 3.926126694058226e-06, + "loss": 0.92285299, + "num_input_tokens_seen": 20148145, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 3.76367188, + "step": 950, + "time_per_iteration": 3.0971932411193848 + }, + { + "auxiliary_loss_clip": 0.01606742, + "auxiliary_loss_mlp": 0.01498616, + "balance_loss_clip": 1.24239612, + "balance_loss_mlp": 1.11619258, + "epoch": 0.1143509889977755, + "flos": 19719653967360.0, + "grad_norm": 1.5471508074439675, + "language_loss": 0.82107818, + "learning_rate": 3.92591679042935e-06, + "loss": 0.85213172, + "num_input_tokens_seen": 20168035, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 3.8203125, + "step": 951, + "time_per_iteration": 3.043452024459839 + }, + { + "auxiliary_loss_clip": 0.01612791, + "auxiliary_loss_mlp": 0.01496779, + "balance_loss_clip": 1.24721944, + "balance_loss_mlp": 1.12084079, + "epoch": 0.1144712318884146, + "flos": 19824602279040.0, + "grad_norm": 3.2225755560681506, + "language_loss": 0.82348919, + "learning_rate": 3.92570659464043e-06, + "loss": 0.85458493, + "num_input_tokens_seen": 20186095, + "router_z_loss_clip": 3.65429688, + "router_z_loss_mlp": 3.7578125, + "step": 952, + "time_per_iteration": 3.7998242378234863 + }, + { + "auxiliary_loss_clip": 0.01601899, + "auxiliary_loss_mlp": 0.01495939, + "balance_loss_clip": 1.23719645, + "balance_loss_mlp": 1.12515032, + "epoch": 0.1145914747790537, + "flos": 14940961506720.0, + "grad_norm": 2.041100477751218, + "language_loss": 0.79560202, + "learning_rate": 3.925496106723349e-06, + "loss": 0.82658041, + "num_input_tokens_seen": 20203535, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 3.70507812, + "step": 953, + "time_per_iteration": 2.953439950942993 + }, + { + "auxiliary_loss_clip": 0.01606925, + "auxiliary_loss_mlp": 0.01487773, + "balance_loss_clip": 1.24194002, + "balance_loss_mlp": 1.12041724, + "epoch": 0.11471171766969278, + "flos": 19867082182560.0, + "grad_norm": 2.5135891545614184, + "language_loss": 0.83972228, + "learning_rate": 3.9252853267100405e-06, + "loss": 0.87066925, + "num_input_tokens_seen": 20222780, + "router_z_loss_clip": 3.65039062, + "router_z_loss_mlp": 3.671875, + "step": 954, + "time_per_iteration": 4.832277297973633 + }, + { + "auxiliary_loss_clip": 0.01607626, + "auxiliary_loss_mlp": 0.01489949, + "balance_loss_clip": 1.24035537, + "balance_loss_mlp": 1.12373829, + "epoch": 0.11483196056033187, + "flos": 22528565693280.0, + "grad_norm": 2.86390646120953, + "language_loss": 0.83618492, + "learning_rate": 3.9250742546324786e-06, + "loss": 0.86716068, + "num_input_tokens_seen": 20243015, + "router_z_loss_clip": 3.67382812, + "router_z_loss_mlp": 3.66015625, + "step": 955, + "time_per_iteration": 3.056244134902954 + }, + { + "auxiliary_loss_clip": 0.01603982, + "auxiliary_loss_mlp": 0.01498669, + "balance_loss_clip": 1.23745298, + "balance_loss_mlp": 1.13169515, + "epoch": 0.11495220345097096, + "flos": 28222815116160.0, + "grad_norm": 1.8515057146450455, + "language_loss": 0.8673569, + "learning_rate": 3.924862890522683e-06, + "loss": 0.89838338, + "num_input_tokens_seen": 20263025, + "router_z_loss_clip": 3.6640625, + "router_z_loss_mlp": 3.66796875, + "step": 956, + "time_per_iteration": 3.89819598197937 + }, + { + "auxiliary_loss_clip": 0.01601376, + "auxiliary_loss_mlp": 0.0149454, + "balance_loss_clip": 1.2345047, + "balance_loss_mlp": 1.12871003, + "epoch": 0.11507244634161005, + "flos": 17494121099520.0, + "grad_norm": 3.135534373646464, + "language_loss": 0.86028433, + "learning_rate": 3.9246512344127174e-06, + "loss": 0.89124346, + "num_input_tokens_seen": 20280685, + "router_z_loss_clip": 3.66796875, + "router_z_loss_mlp": 3.65429688, + "step": 957, + "time_per_iteration": 2.9222357273101807 + }, + { + "auxiliary_loss_clip": 0.01600753, + "auxiliary_loss_mlp": 0.01492081, + "balance_loss_clip": 1.2352519, + "balance_loss_mlp": 1.12529731, + "epoch": 0.11519268923224914, + "flos": 22567138996320.0, + "grad_norm": 2.644858723400987, + "language_loss": 0.81771505, + "learning_rate": 3.9244392863346895e-06, + "loss": 0.84864342, + "num_input_tokens_seen": 20300090, + "router_z_loss_clip": 3.65429688, + "router_z_loss_mlp": 3.66796875, + "step": 958, + "time_per_iteration": 3.035324811935425 + }, + { + "auxiliary_loss_clip": 0.01602853, + "auxiliary_loss_mlp": 0.01495171, + "balance_loss_clip": 1.23598218, + "balance_loss_mlp": 1.12953246, + "epoch": 0.11531293212288823, + "flos": 16984740810240.0, + "grad_norm": 2.0107492912157743, + "language_loss": 0.92500895, + "learning_rate": 3.9242270463207524e-06, + "loss": 0.95598924, + "num_input_tokens_seen": 20318480, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 3.65429688, + "step": 959, + "time_per_iteration": 2.908388376235962 + }, + { + "auxiliary_loss_clip": 0.01600862, + "auxiliary_loss_mlp": 0.01503112, + "balance_loss_clip": 1.23359179, + "balance_loss_mlp": 1.13518429, + "epoch": 0.11543317501352733, + "flos": 12423644389440.0, + "grad_norm": 3.133939764372657, + "language_loss": 0.85484517, + "learning_rate": 3.924014514403102e-06, + "loss": 0.88588488, + "num_input_tokens_seen": 20334635, + "router_z_loss_clip": 3.66992188, + "router_z_loss_mlp": 3.67773438, + "step": 960, + "time_per_iteration": 3.0036444664001465 + }, + { + "auxiliary_loss_clip": 0.01596589, + "auxiliary_loss_mlp": 0.01498537, + "balance_loss_clip": 1.23003864, + "balance_loss_mlp": 1.13690341, + "epoch": 0.11555341790416641, + "flos": 19823312721600.0, + "grad_norm": 2.216203838324404, + "language_loss": 0.91441047, + "learning_rate": 3.92380169061398e-06, + "loss": 0.94536173, + "num_input_tokens_seen": 20352415, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 3.6171875, + "step": 961, + "time_per_iteration": 2.9821701049804688 + }, + { + "auxiliary_loss_clip": 0.01597328, + "auxiliary_loss_mlp": 0.01513069, + "balance_loss_clip": 1.22981703, + "balance_loss_mlp": 1.15105438, + "epoch": 0.11567366079480551, + "flos": 25741226689920.0, + "grad_norm": 4.254658015559066, + "language_loss": 0.84149504, + "learning_rate": 3.9235885749856705e-06, + "loss": 0.87259901, + "num_input_tokens_seen": 20371095, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 3.61914062, + "step": 962, + "time_per_iteration": 2.9584009647369385 + }, + { + "auxiliary_loss_clip": 0.01593139, + "auxiliary_loss_mlp": 0.01497147, + "balance_loss_clip": 1.22548032, + "balance_loss_mlp": 1.13722992, + "epoch": 0.1157939036854446, + "flos": 18225269526240.0, + "grad_norm": 2.5100461551293067, + "language_loss": 0.82698858, + "learning_rate": 3.9233751675505035e-06, + "loss": 0.85789144, + "num_input_tokens_seen": 20389805, + "router_z_loss_clip": 3.67382812, + "router_z_loss_mlp": 3.59765625, + "step": 963, + "time_per_iteration": 2.9909744262695312 + }, + { + "auxiliary_loss_clip": 0.01608772, + "auxiliary_loss_mlp": 0.01497098, + "balance_loss_clip": 1.23984134, + "balance_loss_mlp": 1.12726331, + "epoch": 0.11591414657608369, + "flos": 23075874506880.0, + "grad_norm": 3.166863867690409, + "language_loss": 0.85032284, + "learning_rate": 3.923161468340853e-06, + "loss": 0.88138151, + "num_input_tokens_seen": 20409640, + "router_z_loss_clip": 3.69335938, + "router_z_loss_mlp": 3.69335938, + "step": 964, + "time_per_iteration": 2.968754291534424 + }, + { + "auxiliary_loss_clip": 0.01592354, + "auxiliary_loss_mlp": 0.0150209, + "balance_loss_clip": 1.2239145, + "balance_loss_mlp": 1.13397181, + "epoch": 0.11603438946672277, + "flos": 19463636337120.0, + "grad_norm": 1.8119434619581505, + "language_loss": 0.81664246, + "learning_rate": 3.9229474773891374e-06, + "loss": 0.84758687, + "num_input_tokens_seen": 20428180, + "router_z_loss_clip": 3.68164062, + "router_z_loss_mlp": 3.67773438, + "step": 965, + "time_per_iteration": 2.975733995437622 + }, + { + "auxiliary_loss_clip": 0.01591517, + "auxiliary_loss_mlp": 0.01485035, + "balance_loss_clip": 1.22160411, + "balance_loss_mlp": 1.10814333, + "epoch": 0.11615463235736187, + "flos": 26834327190720.0, + "grad_norm": 3.3098539938066374, + "language_loss": 0.83777559, + "learning_rate": 3.922733194727818e-06, + "loss": 0.86854112, + "num_input_tokens_seen": 20447975, + "router_z_loss_clip": 3.69726562, + "router_z_loss_mlp": 3.765625, + "step": 966, + "time_per_iteration": 2.9993815422058105 + }, + { + "auxiliary_loss_clip": 0.01599465, + "auxiliary_loss_mlp": 0.01515862, + "balance_loss_clip": 1.23005438, + "balance_loss_mlp": 1.15594554, + "epoch": 0.11627487524800097, + "flos": 18581873729760.0, + "grad_norm": 2.4325184681107856, + "language_loss": 0.87563443, + "learning_rate": 3.922518620389402e-06, + "loss": 0.90678775, + "num_input_tokens_seen": 20464840, + "router_z_loss_clip": 3.6953125, + "router_z_loss_mlp": 3.59960938, + "step": 967, + "time_per_iteration": 2.9812681674957275 + }, + { + "auxiliary_loss_clip": 0.01604198, + "auxiliary_loss_mlp": 0.01500688, + "balance_loss_clip": 1.23607993, + "balance_loss_mlp": 1.13123441, + "epoch": 0.11639511813864005, + "flos": 18152446730400.0, + "grad_norm": 2.2460600261064347, + "language_loss": 0.89661849, + "learning_rate": 3.922303754406439e-06, + "loss": 0.92766738, + "num_input_tokens_seen": 20482680, + "router_z_loss_clip": 3.67773438, + "router_z_loss_mlp": 3.69140625, + "step": 968, + "time_per_iteration": 3.050179958343506 + }, + { + "auxiliary_loss_clip": 0.01600083, + "auxiliary_loss_mlp": 0.01474694, + "balance_loss_clip": 1.23074603, + "balance_loss_mlp": 1.10600328, + "epoch": 0.11651536102927915, + "flos": 20924074710720.0, + "grad_norm": 2.345881595712853, + "language_loss": 0.79193264, + "learning_rate": 3.922088596811526e-06, + "loss": 0.82268041, + "num_input_tokens_seen": 20501810, + "router_z_loss_clip": 3.69140625, + "router_z_loss_mlp": 3.68359375, + "step": 969, + "time_per_iteration": 3.096445322036743 + }, + { + "auxiliary_loss_clip": 0.01594535, + "auxiliary_loss_mlp": 0.01510595, + "balance_loss_clip": 1.22474933, + "balance_loss_mlp": 1.13828015, + "epoch": 0.11663560391991823, + "flos": 16510482361440.0, + "grad_norm": 2.2313877291732096, + "language_loss": 0.86875904, + "learning_rate": 3.9218731476373e-06, + "loss": 0.89981037, + "num_input_tokens_seen": 20517995, + "router_z_loss_clip": 3.69726562, + "router_z_loss_mlp": 3.72070312, + "step": 970, + "time_per_iteration": 3.008643627166748 + }, + { + "auxiliary_loss_clip": 0.01599498, + "auxiliary_loss_mlp": 0.01477695, + "balance_loss_clip": 1.22930694, + "balance_loss_mlp": 1.09412766, + "epoch": 0.11675584681055733, + "flos": 19867082182560.0, + "grad_norm": 2.3419627305157307, + "language_loss": 0.84908891, + "learning_rate": 3.9216574069164455e-06, + "loss": 0.87986088, + "num_input_tokens_seen": 20536970, + "router_z_loss_clip": 3.70507812, + "router_z_loss_mlp": 3.83398438, + "step": 971, + "time_per_iteration": 3.0071961879730225 + }, + { + "auxiliary_loss_clip": 0.01596799, + "auxiliary_loss_mlp": 0.01473583, + "balance_loss_clip": 1.22883987, + "balance_loss_mlp": 1.0924952, + "epoch": 0.11687608970119642, + "flos": 21946514320800.0, + "grad_norm": 1.8513180779904996, + "language_loss": 0.80233985, + "learning_rate": 3.921441374681691e-06, + "loss": 0.83304369, + "num_input_tokens_seen": 20557030, + "router_z_loss_clip": 3.6796875, + "router_z_loss_mlp": 3.80664062, + "step": 972, + "time_per_iteration": 3.147735595703125 + }, + { + "auxiliary_loss_clip": 0.01592541, + "auxiliary_loss_mlp": 0.01488725, + "balance_loss_clip": 1.22383595, + "balance_loss_mlp": 1.10954428, + "epoch": 0.1169963325918355, + "flos": 24063457773600.0, + "grad_norm": 2.2515716721052454, + "language_loss": 0.6477704, + "learning_rate": 3.921225050965808e-06, + "loss": 0.67858303, + "num_input_tokens_seen": 20576915, + "router_z_loss_clip": 3.68554688, + "router_z_loss_mlp": 3.79101562, + "step": 973, + "time_per_iteration": 3.0743777751922607 + }, + { + "auxiliary_loss_clip": 0.01600794, + "auxiliary_loss_mlp": 0.01473065, + "balance_loss_clip": 1.23182106, + "balance_loss_mlp": 1.09464717, + "epoch": 0.1171165754824746, + "flos": 23370427512000.0, + "grad_norm": 2.1983291892300407, + "language_loss": 0.75289065, + "learning_rate": 3.921008435801612e-06, + "loss": 0.7836293, + "num_input_tokens_seen": 20596000, + "router_z_loss_clip": 3.68945312, + "router_z_loss_mlp": 3.78125, + "step": 974, + "time_per_iteration": 3.026371479034424 + }, + { + "auxiliary_loss_clip": 0.01598416, + "auxiliary_loss_mlp": 0.01478982, + "balance_loss_clip": 1.22807598, + "balance_loss_mlp": 1.10533249, + "epoch": 0.11723681837311369, + "flos": 18554261664960.0, + "grad_norm": 2.252721929711664, + "language_loss": 0.75831354, + "learning_rate": 3.920791529221963e-06, + "loss": 0.78908753, + "num_input_tokens_seen": 20614675, + "router_z_loss_clip": 3.703125, + "router_z_loss_mlp": 3.73242188, + "step": 975, + "time_per_iteration": 3.203629732131958 + }, + { + "auxiliary_loss_clip": 0.01597204, + "auxiliary_loss_mlp": 0.01496573, + "balance_loss_clip": 1.22772908, + "balance_loss_mlp": 1.1271193, + "epoch": 0.11735706126375278, + "flos": 23552825855040.0, + "grad_norm": 2.798195775684378, + "language_loss": 0.763955, + "learning_rate": 3.920574331259768e-06, + "loss": 0.79489279, + "num_input_tokens_seen": 20635875, + "router_z_loss_clip": 3.69335938, + "router_z_loss_mlp": 3.69335938, + "step": 976, + "time_per_iteration": 3.112847328186035 + }, + { + "auxiliary_loss_clip": 0.01604451, + "auxiliary_loss_mlp": 0.01469079, + "balance_loss_clip": 1.23578882, + "balance_loss_mlp": 1.091043, + "epoch": 0.11747730415439187, + "flos": 22383678664800.0, + "grad_norm": 3.1573958974369862, + "language_loss": 0.7957592, + "learning_rate": 3.9203568419479716e-06, + "loss": 0.82649446, + "num_input_tokens_seen": 20656430, + "router_z_loss_clip": 3.6875, + "router_z_loss_mlp": 3.77539062, + "step": 977, + "time_per_iteration": 2.943866729736328 + }, + { + "auxiliary_loss_clip": 0.01601834, + "auxiliary_loss_mlp": 0.01497106, + "balance_loss_clip": 1.23318243, + "balance_loss_mlp": 1.12631726, + "epoch": 0.11759754704503096, + "flos": 22202949160800.0, + "grad_norm": 2.1761351959480213, + "language_loss": 0.75409615, + "learning_rate": 3.92013906131957e-06, + "loss": 0.78508556, + "num_input_tokens_seen": 20675360, + "router_z_loss_clip": 3.68554688, + "router_z_loss_mlp": 3.70507812, + "step": 978, + "time_per_iteration": 2.9949817657470703 + }, + { + "auxiliary_loss_clip": 0.01602473, + "auxiliary_loss_mlp": 0.01465093, + "balance_loss_clip": 1.23382163, + "balance_loss_mlp": 1.08705604, + "epoch": 0.11771778993567006, + "flos": 22311955785600.0, + "grad_norm": 1.704922979723576, + "language_loss": 0.82943815, + "learning_rate": 3.9199209894076e-06, + "loss": 0.86011386, + "num_input_tokens_seen": 20695675, + "router_z_loss_clip": 3.68554688, + "router_z_loss_mlp": 3.77734375, + "step": 979, + "time_per_iteration": 4.008862733840942 + }, + { + "auxiliary_loss_clip": 0.01596428, + "auxiliary_loss_mlp": 0.01476005, + "balance_loss_clip": 1.2260313, + "balance_loss_mlp": 1.10178328, + "epoch": 0.11783803282630914, + "flos": 21290312666880.0, + "grad_norm": 2.4869130083528748, + "language_loss": 0.90139794, + "learning_rate": 3.919702626245142e-06, + "loss": 0.93212223, + "num_input_tokens_seen": 20715330, + "router_z_loss_clip": 3.69921875, + "router_z_loss_mlp": 3.73828125, + "step": 980, + "time_per_iteration": 2.970205307006836 + }, + { + "auxiliary_loss_clip": 0.01597435, + "auxiliary_loss_mlp": 0.01489808, + "balance_loss_clip": 1.22834921, + "balance_loss_mlp": 1.10872006, + "epoch": 0.11795827571694824, + "flos": 25373737104480.0, + "grad_norm": 3.0803536825756197, + "language_loss": 0.65947986, + "learning_rate": 3.919483971865322e-06, + "loss": 0.69035232, + "num_input_tokens_seen": 20735325, + "router_z_loss_clip": 3.6875, + "router_z_loss_mlp": 3.80664062, + "step": 981, + "time_per_iteration": 4.724695444107056 + }, + { + "auxiliary_loss_clip": 0.01603759, + "auxiliary_loss_mlp": 0.01497381, + "balance_loss_clip": 1.23451424, + "balance_loss_mlp": 1.12697387, + "epoch": 0.11807851860758732, + "flos": 23624434949760.0, + "grad_norm": 2.0694987368034035, + "language_loss": 0.88289738, + "learning_rate": 3.91926502630131e-06, + "loss": 0.91390884, + "num_input_tokens_seen": 20755940, + "router_z_loss_clip": 3.69335938, + "router_z_loss_mlp": 3.703125, + "step": 982, + "time_per_iteration": 3.0005805492401123 + }, + { + "auxiliary_loss_clip": 0.01602833, + "auxiliary_loss_mlp": 0.01497309, + "balance_loss_clip": 1.23554814, + "balance_loss_mlp": 1.1270926, + "epoch": 0.11819876149822642, + "flos": 24974804710080.0, + "grad_norm": 2.1123983854476793, + "language_loss": 0.72154534, + "learning_rate": 3.91904578958632e-06, + "loss": 0.75254679, + "num_input_tokens_seen": 20775355, + "router_z_loss_clip": 3.66796875, + "router_z_loss_mlp": 3.69921875, + "step": 983, + "time_per_iteration": 3.02645206451416 + }, + { + "auxiliary_loss_clip": 0.01601181, + "auxiliary_loss_mlp": 0.01495957, + "balance_loss_clip": 1.23310137, + "balance_loss_mlp": 1.11849272, + "epoch": 0.11831900438886551, + "flos": 23005479113280.0, + "grad_norm": 2.1397531774190184, + "language_loss": 0.84239244, + "learning_rate": 3.918826261753608e-06, + "loss": 0.87336385, + "num_input_tokens_seen": 20794935, + "router_z_loss_clip": 3.68164062, + "router_z_loss_mlp": 3.77148438, + "step": 984, + "time_per_iteration": 3.891993284225464 + }, + { + "auxiliary_loss_clip": 0.01600431, + "auxiliary_loss_mlp": 0.01484558, + "balance_loss_clip": 1.23271513, + "balance_loss_mlp": 1.10652161, + "epoch": 0.1184392472795046, + "flos": 27967669833600.0, + "grad_norm": 3.5142494044739574, + "language_loss": 0.71368361, + "learning_rate": 3.918606442836478e-06, + "loss": 0.74453348, + "num_input_tokens_seen": 20817155, + "router_z_loss_clip": 3.6796875, + "router_z_loss_mlp": 3.77539062, + "step": 985, + "time_per_iteration": 3.042210102081299 + }, + { + "auxiliary_loss_clip": 0.01604234, + "auxiliary_loss_mlp": 0.01499722, + "balance_loss_clip": 1.23724222, + "balance_loss_mlp": 1.13770747, + "epoch": 0.1185594901701437, + "flos": 19900383471360.0, + "grad_norm": 2.04227443430389, + "language_loss": 0.77482539, + "learning_rate": 3.918386332868277e-06, + "loss": 0.80586493, + "num_input_tokens_seen": 20835125, + "router_z_loss_clip": 3.66796875, + "router_z_loss_mlp": 3.61914062, + "step": 986, + "time_per_iteration": 2.905324697494507 + }, + { + "auxiliary_loss_clip": 0.01604676, + "auxiliary_loss_mlp": 0.01489051, + "balance_loss_clip": 1.23691797, + "balance_loss_mlp": 1.12188649, + "epoch": 0.11867973306078278, + "flos": 18914127690240.0, + "grad_norm": 1.858050050781824, + "language_loss": 0.94418466, + "learning_rate": 3.918165931882394e-06, + "loss": 0.97512197, + "num_input_tokens_seen": 20853525, + "router_z_loss_clip": 3.67382812, + "router_z_loss_mlp": 3.66796875, + "step": 987, + "time_per_iteration": 2.9270424842834473 + }, + { + "auxiliary_loss_clip": 0.01613223, + "auxiliary_loss_mlp": 0.01498938, + "balance_loss_clip": 1.24522471, + "balance_loss_mlp": 1.13158274, + "epoch": 0.11879997595142187, + "flos": 16984702882080.0, + "grad_norm": 3.9077796192856153, + "language_loss": 0.75268406, + "learning_rate": 3.917945239912264e-06, + "loss": 0.78380567, + "num_input_tokens_seen": 20871000, + "router_z_loss_clip": 3.6796875, + "router_z_loss_mlp": 3.67382812, + "step": 988, + "time_per_iteration": 2.9703898429870605 + }, + { + "auxiliary_loss_clip": 0.01621536, + "auxiliary_loss_mlp": 0.01497438, + "balance_loss_clip": 1.25464654, + "balance_loss_mlp": 1.14038205, + "epoch": 0.11892021884206096, + "flos": 17532315120960.0, + "grad_norm": 2.4746174049044876, + "language_loss": 0.75491601, + "learning_rate": 3.917724256991367e-06, + "loss": 0.78610575, + "num_input_tokens_seen": 20889745, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 3.56835938, + "step": 989, + "time_per_iteration": 2.8786373138427734 + }, + { + "auxiliary_loss_clip": 0.01609323, + "auxiliary_loss_mlp": 0.01474321, + "balance_loss_clip": 1.24306989, + "balance_loss_mlp": 1.10658455, + "epoch": 0.11904046173270005, + "flos": 30958524764640.0, + "grad_norm": 2.189778358484607, + "language_loss": 0.81725109, + "learning_rate": 3.9175029831532245e-06, + "loss": 0.84808755, + "num_input_tokens_seen": 20909260, + "router_z_loss_clip": 3.66015625, + "router_z_loss_mlp": 3.671875, + "step": 990, + "time_per_iteration": 3.0337648391723633 + }, + { + "auxiliary_loss_clip": 0.01614418, + "auxiliary_loss_mlp": 0.01489394, + "balance_loss_clip": 1.24782526, + "balance_loss_mlp": 1.11727047, + "epoch": 0.11916070462333915, + "flos": 20159321569920.0, + "grad_norm": 2.3795298348399103, + "language_loss": 0.88717037, + "learning_rate": 3.917281418431404e-06, + "loss": 0.91820848, + "num_input_tokens_seen": 20928305, + "router_z_loss_clip": 3.6640625, + "router_z_loss_mlp": 3.72070312, + "step": 991, + "time_per_iteration": 2.9247612953186035 + }, + { + "auxiliary_loss_clip": 0.01613055, + "auxiliary_loss_mlp": 0.01496766, + "balance_loss_clip": 1.24733102, + "balance_loss_mlp": 1.12845695, + "epoch": 0.11928094751397823, + "flos": 23553432705600.0, + "grad_norm": 2.1079148046572818, + "language_loss": 0.77065933, + "learning_rate": 3.917059562859516e-06, + "loss": 0.80175757, + "num_input_tokens_seen": 20947630, + "router_z_loss_clip": 3.65820312, + "router_z_loss_mlp": 3.6796875, + "step": 992, + "time_per_iteration": 2.961560010910034 + }, + { + "auxiliary_loss_clip": 0.01621409, + "auxiliary_loss_mlp": 0.01491707, + "balance_loss_clip": 1.25641298, + "balance_loss_mlp": 1.12645006, + "epoch": 0.11940119040461733, + "flos": 23910340334400.0, + "grad_norm": 10.967077058335676, + "language_loss": 0.88771808, + "learning_rate": 3.916837416471218e-06, + "loss": 0.91884929, + "num_input_tokens_seen": 20964250, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 3.65234375, + "step": 993, + "time_per_iteration": 3.0644314289093018 + }, + { + "auxiliary_loss_clip": 0.01610035, + "auxiliary_loss_mlp": 0.01500919, + "balance_loss_clip": 1.24317789, + "balance_loss_mlp": 1.12765145, + "epoch": 0.11952143329525641, + "flos": 13846230095040.0, + "grad_norm": 2.38446020253248, + "language_loss": 0.72747403, + "learning_rate": 3.916614979300207e-06, + "loss": 0.75858361, + "num_input_tokens_seen": 20979095, + "router_z_loss_clip": 3.66796875, + "router_z_loss_mlp": 3.73046875, + "step": 994, + "time_per_iteration": 2.9437294006347656 + }, + { + "auxiliary_loss_clip": 0.01615408, + "auxiliary_loss_mlp": 0.01485026, + "balance_loss_clip": 1.2491864, + "balance_loss_mlp": 1.11480963, + "epoch": 0.11964167618589551, + "flos": 27017939234880.0, + "grad_norm": 1.7316223484130007, + "language_loss": 0.78859282, + "learning_rate": 3.9163922513802274e-06, + "loss": 0.81959713, + "num_input_tokens_seen": 21001430, + "router_z_loss_clip": 3.65820312, + "router_z_loss_mlp": 3.69921875, + "step": 995, + "time_per_iteration": 3.0788564682006836 + }, + { + "auxiliary_loss_clip": 0.01599458, + "auxiliary_loss_mlp": 0.01494998, + "balance_loss_clip": 1.23328924, + "balance_loss_mlp": 1.12573493, + "epoch": 0.1197619190765346, + "flos": 12569252052960.0, + "grad_norm": 3.244207182738005, + "language_loss": 0.82924223, + "learning_rate": 3.916169232745067e-06, + "loss": 0.86018682, + "num_input_tokens_seen": 21019105, + "router_z_loss_clip": 3.65820312, + "router_z_loss_mlp": 3.68945312, + "step": 996, + "time_per_iteration": 2.9205424785614014 + }, + { + "auxiliary_loss_clip": 0.01607734, + "auxiliary_loss_mlp": 0.01504632, + "balance_loss_clip": 1.2404108, + "balance_loss_mlp": 1.13956511, + "epoch": 0.11988216196717369, + "flos": 16911311163840.0, + "grad_norm": 4.549370708926486, + "language_loss": 0.92510271, + "learning_rate": 3.915945923428559e-06, + "loss": 0.95622635, + "num_input_tokens_seen": 21035630, + "router_z_loss_clip": 3.67382812, + "router_z_loss_mlp": 3.6484375, + "step": 997, + "time_per_iteration": 2.9320712089538574 + }, + { + "auxiliary_loss_clip": 0.0161011, + "auxiliary_loss_mlp": 0.01490397, + "balance_loss_clip": 1.24368036, + "balance_loss_mlp": 1.11655641, + "epoch": 0.12000240485781279, + "flos": 16218432614880.0, + "grad_norm": 2.291553361334355, + "language_loss": 0.83206904, + "learning_rate": 3.915722323464577e-06, + "loss": 0.86307406, + "num_input_tokens_seen": 21054235, + "router_z_loss_clip": 3.66210938, + "router_z_loss_mlp": 3.734375, + "step": 998, + "time_per_iteration": 2.9801483154296875 + }, + { + "auxiliary_loss_clip": 0.01613444, + "auxiliary_loss_mlp": 0.01483061, + "balance_loss_clip": 1.24717641, + "balance_loss_mlp": 1.11627722, + "epoch": 0.12012264774845187, + "flos": 49348556039520.0, + "grad_norm": 4.993412276240558, + "language_loss": 0.70646775, + "learning_rate": 3.91549843288704e-06, + "loss": 0.73743278, + "num_input_tokens_seen": 21077915, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 3.6640625, + "step": 999, + "time_per_iteration": 3.1308250427246094 + }, + { + "auxiliary_loss_clip": 0.01608622, + "auxiliary_loss_mlp": 0.01477011, + "balance_loss_clip": 1.2422663, + "balance_loss_mlp": 1.11099041, + "epoch": 0.12024289063909097, + "flos": 26981869190400.0, + "grad_norm": 3.335823013962979, + "language_loss": 0.79216099, + "learning_rate": 3.915274251729916e-06, + "loss": 0.8230173, + "num_input_tokens_seen": 21099205, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 3.65820312, + "step": 1000, + "time_per_iteration": 2.9333255290985107 + }, + { + "auxiliary_loss_clip": 0.01613203, + "auxiliary_loss_mlp": 0.01495294, + "balance_loss_clip": 1.24680114, + "balance_loss_mlp": 1.13080001, + "epoch": 0.12036313352973005, + "flos": 19539379601280.0, + "grad_norm": 14.742550595474695, + "language_loss": 0.90292621, + "learning_rate": 3.91504978002721e-06, + "loss": 0.93401116, + "num_input_tokens_seen": 21118260, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 3.64257812, + "step": 1001, + "time_per_iteration": 2.89632511138916 + }, + { + "auxiliary_loss_clip": 0.01615503, + "auxiliary_loss_mlp": 0.01486387, + "balance_loss_clip": 1.24958181, + "balance_loss_mlp": 1.11998522, + "epoch": 0.12048337642036915, + "flos": 17269811775360.0, + "grad_norm": 2.6137109630528244, + "language_loss": 0.76550364, + "learning_rate": 3.914825017812974e-06, + "loss": 0.79652262, + "num_input_tokens_seen": 21134910, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 3.66210938, + "step": 1002, + "time_per_iteration": 2.9037179946899414 + }, + { + "auxiliary_loss_clip": 0.01620829, + "auxiliary_loss_mlp": 0.01491997, + "balance_loss_clip": 1.25501049, + "balance_loss_mlp": 1.12406945, + "epoch": 0.12060361931100824, + "flos": 22859188742880.0, + "grad_norm": 2.6392440603958907, + "language_loss": 0.72911757, + "learning_rate": 3.9145999651213065e-06, + "loss": 0.7602458, + "num_input_tokens_seen": 21154150, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 3.67773438, + "step": 1003, + "time_per_iteration": 2.905503034591675 + }, + { + "auxiliary_loss_clip": 0.01603112, + "auxiliary_loss_mlp": 0.01502506, + "balance_loss_clip": 1.23651981, + "balance_loss_mlp": 1.12942851, + "epoch": 0.12072386220164733, + "flos": 16728647323680.0, + "grad_norm": 2.9983000628483674, + "language_loss": 0.8830663, + "learning_rate": 3.9143746219863465e-06, + "loss": 0.91412246, + "num_input_tokens_seen": 21171255, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 3.72851562, + "step": 1004, + "time_per_iteration": 2.911916971206665 + }, + { + "auxiliary_loss_clip": 0.01808233, + "auxiliary_loss_mlp": 0.01465538, + "balance_loss_clip": 1.45390427, + "balance_loss_mlp": 1.10695648, + "epoch": 0.12084410509228642, + "flos": 55150104955680.0, + "grad_norm": 0.9723024979977121, + "language_loss": 0.64714372, + "learning_rate": 3.914148988442278e-06, + "loss": 0.67988145, + "num_input_tokens_seen": 21227045, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 3.5859375, + "step": 1005, + "time_per_iteration": 3.44449782371521 + }, + { + "auxiliary_loss_clip": 0.01617543, + "auxiliary_loss_mlp": 0.01488454, + "balance_loss_clip": 1.25001407, + "balance_loss_mlp": 1.11213422, + "epoch": 0.1209643479829255, + "flos": 26762642239680.0, + "grad_norm": 3.137123284939273, + "language_loss": 0.95405197, + "learning_rate": 3.91392306452333e-06, + "loss": 0.98511195, + "num_input_tokens_seen": 21244120, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 3.75976562, + "step": 1006, + "time_per_iteration": 3.0644078254699707 + }, + { + "auxiliary_loss_clip": 0.01615939, + "auxiliary_loss_mlp": 0.01499099, + "balance_loss_clip": 1.24893641, + "balance_loss_mlp": 1.12544978, + "epoch": 0.1210845908735646, + "flos": 11036559805920.0, + "grad_norm": 3.052368521353159, + "language_loss": 0.66565573, + "learning_rate": 3.913696850263774e-06, + "loss": 0.69680619, + "num_input_tokens_seen": 21258485, + "router_z_loss_clip": 3.67382812, + "router_z_loss_mlp": 3.73242188, + "step": 1007, + "time_per_iteration": 3.759110689163208 + }, + { + "auxiliary_loss_clip": 0.0160773, + "auxiliary_loss_mlp": 0.01489534, + "balance_loss_clip": 1.24172413, + "balance_loss_mlp": 1.11531258, + "epoch": 0.1212048337642037, + "flos": 20486682797760.0, + "grad_norm": 2.3447689422841727, + "language_loss": 0.79344881, + "learning_rate": 3.913470345697929e-06, + "loss": 0.82442141, + "num_input_tokens_seen": 21277115, + "router_z_loss_clip": 3.66015625, + "router_z_loss_mlp": 3.74023438, + "step": 1008, + "time_per_iteration": 3.0354819297790527 + }, + { + "auxiliary_loss_clip": 0.01618062, + "auxiliary_loss_mlp": 0.01484365, + "balance_loss_clip": 1.25093985, + "balance_loss_mlp": 1.10957074, + "epoch": 0.12132507665484278, + "flos": 22348253399040.0, + "grad_norm": 2.5839171669350973, + "language_loss": 0.85704523, + "learning_rate": 3.913243550860153e-06, + "loss": 0.88806951, + "num_input_tokens_seen": 21294880, + "router_z_loss_clip": 3.66796875, + "router_z_loss_mlp": 3.74414062, + "step": 1009, + "time_per_iteration": 4.6241559982299805 + }, + { + "auxiliary_loss_clip": 0.01617328, + "auxiliary_loss_mlp": 0.01475452, + "balance_loss_clip": 1.25057352, + "balance_loss_mlp": 1.10313725, + "epoch": 0.12144531954548188, + "flos": 29317546527840.0, + "grad_norm": 2.408001345022169, + "language_loss": 0.76151645, + "learning_rate": 3.913016465784852e-06, + "loss": 0.79244435, + "num_input_tokens_seen": 21315555, + "router_z_loss_clip": 3.66992188, + "router_z_loss_mlp": 3.71875, + "step": 1010, + "time_per_iteration": 2.986823081970215 + }, + { + "auxiliary_loss_clip": 0.01616837, + "auxiliary_loss_mlp": 0.01488197, + "balance_loss_clip": 1.24838603, + "balance_loss_mlp": 1.11702681, + "epoch": 0.12156556243612096, + "flos": 20487327576480.0, + "grad_norm": 3.125602825589992, + "language_loss": 0.71270299, + "learning_rate": 3.912789090506474e-06, + "loss": 0.74375325, + "num_input_tokens_seen": 21334815, + "router_z_loss_clip": 3.68359375, + "router_z_loss_mlp": 3.70898438, + "step": 1011, + "time_per_iteration": 2.9875576496124268 + }, + { + "auxiliary_loss_clip": 0.01610148, + "auxiliary_loss_mlp": 0.01506203, + "balance_loss_clip": 1.24220967, + "balance_loss_mlp": 1.13312614, + "epoch": 0.12168580532676006, + "flos": 16473881322720.0, + "grad_norm": 2.1428427014990756, + "language_loss": 0.71942729, + "learning_rate": 3.9125614250595114e-06, + "loss": 0.7505908, + "num_input_tokens_seen": 21351025, + "router_z_loss_clip": 3.68164062, + "router_z_loss_mlp": 3.7265625, + "step": 1012, + "time_per_iteration": 2.9684178829193115 + }, + { + "auxiliary_loss_clip": 0.01614861, + "auxiliary_loss_mlp": 0.01496751, + "balance_loss_clip": 1.24591231, + "balance_loss_mlp": 1.13130283, + "epoch": 0.12180604821739914, + "flos": 15343117794720.0, + "grad_norm": 2.6135276773490994, + "language_loss": 0.88852817, + "learning_rate": 3.912333469478502e-06, + "loss": 0.91964424, + "num_input_tokens_seen": 21368990, + "router_z_loss_clip": 3.68945312, + "router_z_loss_mlp": 3.65234375, + "step": 1013, + "time_per_iteration": 3.766427755355835 + }, + { + "auxiliary_loss_clip": 0.0161199, + "auxiliary_loss_mlp": 0.01497757, + "balance_loss_clip": 1.24404514, + "balance_loss_mlp": 1.13669574, + "epoch": 0.12192629110803824, + "flos": 19320076794240.0, + "grad_norm": 3.0823365526709865, + "language_loss": 0.77883321, + "learning_rate": 3.912105223798025e-06, + "loss": 0.80993068, + "num_input_tokens_seen": 21388410, + "router_z_loss_clip": 3.68164062, + "router_z_loss_mlp": 3.60742188, + "step": 1014, + "time_per_iteration": 2.988960027694702 + }, + { + "auxiliary_loss_clip": 0.01787978, + "auxiliary_loss_mlp": 0.01438194, + "balance_loss_clip": 1.43032932, + "balance_loss_mlp": 1.11470795, + "epoch": 0.12204653399867733, + "flos": 47730182986080.0, + "grad_norm": 0.9984503345680673, + "language_loss": 0.6765793, + "learning_rate": 3.9118766880527065e-06, + "loss": 0.70884103, + "num_input_tokens_seen": 21442845, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 3.234375, + "step": 1015, + "time_per_iteration": 3.3739500045776367 + }, + { + "auxiliary_loss_clip": 0.01607625, + "auxiliary_loss_mlp": 0.01493154, + "balance_loss_clip": 1.23877168, + "balance_loss_mlp": 1.12541771, + "epoch": 0.12216677688931642, + "flos": 18223828256160.0, + "grad_norm": 1.8152711220703062, + "language_loss": 0.73792535, + "learning_rate": 3.9116478622772145e-06, + "loss": 0.76893312, + "num_input_tokens_seen": 21461420, + "router_z_loss_clip": 3.68945312, + "router_z_loss_mlp": 3.67578125, + "step": 1016, + "time_per_iteration": 2.9109177589416504 + }, + { + "auxiliary_loss_clip": 0.01615236, + "auxiliary_loss_mlp": 0.01488322, + "balance_loss_clip": 1.24683881, + "balance_loss_mlp": 1.12382793, + "epoch": 0.12228701977995551, + "flos": 27528040159200.0, + "grad_norm": 1.779632648237381, + "language_loss": 0.87975049, + "learning_rate": 3.911418746506261e-06, + "loss": 0.91078615, + "num_input_tokens_seen": 21481550, + "router_z_loss_clip": 3.68359375, + "router_z_loss_mlp": 3.64257812, + "step": 1017, + "time_per_iteration": 3.2039082050323486 + }, + { + "auxiliary_loss_clip": 0.01616263, + "auxiliary_loss_mlp": 0.01501656, + "balance_loss_clip": 1.24842739, + "balance_loss_mlp": 1.13735271, + "epoch": 0.1224072626705946, + "flos": 21800261878560.0, + "grad_norm": 1.9769982680197644, + "language_loss": 0.78791094, + "learning_rate": 3.911189340774604e-06, + "loss": 0.81909013, + "num_input_tokens_seen": 21501680, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 3.64257812, + "step": 1018, + "time_per_iteration": 3.0517091751098633 + }, + { + "auxiliary_loss_clip": 0.01615906, + "auxiliary_loss_mlp": 0.01483644, + "balance_loss_clip": 1.24822569, + "balance_loss_mlp": 1.12143826, + "epoch": 0.1225275055612337, + "flos": 20705833892160.0, + "grad_norm": 2.0202644525032873, + "language_loss": 0.79660428, + "learning_rate": 3.910959645117043e-06, + "loss": 0.82759976, + "num_input_tokens_seen": 21521015, + "router_z_loss_clip": 3.67773438, + "router_z_loss_mlp": 3.62109375, + "step": 1019, + "time_per_iteration": 2.9953811168670654 + }, + { + "auxiliary_loss_clip": 0.01770419, + "auxiliary_loss_mlp": 0.01438003, + "balance_loss_clip": 1.41322255, + "balance_loss_mlp": 1.11451721, + "epoch": 0.12264774845187278, + "flos": 57751964670240.0, + "grad_norm": 0.8263681772386899, + "language_loss": 0.56680477, + "learning_rate": 3.910729659568423e-06, + "loss": 0.59888899, + "num_input_tokens_seen": 21578200, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 3.234375, + "step": 1020, + "time_per_iteration": 3.5020720958709717 + }, + { + "auxiliary_loss_clip": 0.01619813, + "auxiliary_loss_mlp": 0.01496814, + "balance_loss_clip": 1.25266957, + "balance_loss_mlp": 1.13479948, + "epoch": 0.12276799134251187, + "flos": 26398717901280.0, + "grad_norm": 1.8600400544574522, + "language_loss": 0.82526159, + "learning_rate": 3.9104993841636344e-06, + "loss": 0.85642791, + "num_input_tokens_seen": 21598770, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 3.61914062, + "step": 1021, + "time_per_iteration": 2.945542573928833 + }, + { + "auxiliary_loss_clip": 0.01618014, + "auxiliary_loss_mlp": 0.01494444, + "balance_loss_clip": 1.24952841, + "balance_loss_mlp": 1.13433707, + "epoch": 0.12288823423315097, + "flos": 21066041270880.0, + "grad_norm": 1.7949718624061732, + "language_loss": 0.80774856, + "learning_rate": 3.910268818937608e-06, + "loss": 0.83887315, + "num_input_tokens_seen": 21616925, + "router_z_loss_clip": 3.68359375, + "router_z_loss_mlp": 3.6015625, + "step": 1022, + "time_per_iteration": 2.9753029346466064 + }, + { + "auxiliary_loss_clip": 0.01624784, + "auxiliary_loss_mlp": 0.01498985, + "balance_loss_clip": 1.25575089, + "balance_loss_mlp": 1.1415478, + "epoch": 0.12300847712379005, + "flos": 12314068842240.0, + "grad_norm": 2.7648479771176246, + "language_loss": 0.878775, + "learning_rate": 3.9100379639253196e-06, + "loss": 0.91001272, + "num_input_tokens_seen": 21633645, + "router_z_loss_clip": 3.68945312, + "router_z_loss_mlp": 3.57226562, + "step": 1023, + "time_per_iteration": 2.8887815475463867 + }, + { + "auxiliary_loss_clip": 0.01608994, + "auxiliary_loss_mlp": 0.01491603, + "balance_loss_clip": 1.23936033, + "balance_loss_mlp": 1.12997019, + "epoch": 0.12312872001442915, + "flos": 16764034661280.0, + "grad_norm": 3.152293938450886, + "language_loss": 0.86300832, + "learning_rate": 3.909806819161791e-06, + "loss": 0.8940143, + "num_input_tokens_seen": 21649120, + "router_z_loss_clip": 3.6953125, + "router_z_loss_mlp": 3.61914062, + "step": 1024, + "time_per_iteration": 2.9511148929595947 + }, + { + "auxiliary_loss_clip": 0.01600317, + "auxiliary_loss_mlp": 0.01482896, + "balance_loss_clip": 1.23208213, + "balance_loss_mlp": 1.11897421, + "epoch": 0.12324896290506823, + "flos": 18406871377920.0, + "grad_norm": 2.3134475318827454, + "language_loss": 0.86507833, + "learning_rate": 3.909575384682086e-06, + "loss": 0.8959105, + "num_input_tokens_seen": 21668000, + "router_z_loss_clip": 3.6796875, + "router_z_loss_mlp": 3.63867188, + "step": 1025, + "time_per_iteration": 2.9319355487823486 + }, + { + "auxiliary_loss_clip": 0.01609239, + "auxiliary_loss_mlp": 0.0150495, + "balance_loss_clip": 1.24027991, + "balance_loss_mlp": 1.13664114, + "epoch": 0.12336920579570733, + "flos": 18917351583840.0, + "grad_norm": 2.0918452210501606, + "language_loss": 0.69155025, + "learning_rate": 3.9093436605213144e-06, + "loss": 0.72269213, + "num_input_tokens_seen": 21688500, + "router_z_loss_clip": 3.68945312, + "router_z_loss_mlp": 3.68164062, + "step": 1026, + "time_per_iteration": 2.930551052093506 + }, + { + "auxiliary_loss_clip": 0.01604741, + "auxiliary_loss_mlp": 0.01490081, + "balance_loss_clip": 1.23611355, + "balance_loss_mlp": 1.12596762, + "epoch": 0.12348944868634643, + "flos": 23880642220800.0, + "grad_norm": 5.9162732404497875, + "language_loss": 0.79525208, + "learning_rate": 3.909111646714627e-06, + "loss": 0.82620031, + "num_input_tokens_seen": 21709345, + "router_z_loss_clip": 3.68359375, + "router_z_loss_mlp": 3.63867188, + "step": 1027, + "time_per_iteration": 3.084709405899048 + }, + { + "auxiliary_loss_clip": 0.0160162, + "auxiliary_loss_mlp": 0.01476144, + "balance_loss_clip": 1.23249102, + "balance_loss_mlp": 1.102494, + "epoch": 0.12360969157698551, + "flos": 19028102904000.0, + "grad_norm": 2.9411742467461575, + "language_loss": 0.728239, + "learning_rate": 3.9088793432972206e-06, + "loss": 0.75901663, + "num_input_tokens_seen": 21728165, + "router_z_loss_clip": 3.68945312, + "router_z_loss_mlp": 3.734375, + "step": 1028, + "time_per_iteration": 2.9221291542053223 + }, + { + "auxiliary_loss_clip": 0.01600472, + "auxiliary_loss_mlp": 0.01471033, + "balance_loss_clip": 1.23248398, + "balance_loss_mlp": 1.08803701, + "epoch": 0.1237299344676246, + "flos": 13226288126400.0, + "grad_norm": 2.3286187306620247, + "language_loss": 0.82289934, + "learning_rate": 3.908646750304336e-06, + "loss": 0.85361433, + "num_input_tokens_seen": 21745850, + "router_z_loss_clip": 3.68164062, + "router_z_loss_mlp": 3.828125, + "step": 1029, + "time_per_iteration": 2.9634339809417725 + }, + { + "auxiliary_loss_clip": 0.01603419, + "auxiliary_loss_mlp": 0.0149302, + "balance_loss_clip": 1.23585129, + "balance_loss_mlp": 1.12032461, + "epoch": 0.12385017735826369, + "flos": 20487782714400.0, + "grad_norm": 2.41483909068282, + "language_loss": 0.87263334, + "learning_rate": 3.908413867771257e-06, + "loss": 0.90359771, + "num_input_tokens_seen": 21764760, + "router_z_loss_clip": 3.67382812, + "router_z_loss_mlp": 3.72460938, + "step": 1030, + "time_per_iteration": 2.977562427520752 + }, + { + "auxiliary_loss_clip": 0.01599358, + "auxiliary_loss_mlp": 0.01494458, + "balance_loss_clip": 1.23190355, + "balance_loss_mlp": 1.1208086, + "epoch": 0.12397042024890279, + "flos": 17349765065280.0, + "grad_norm": 2.000223203909374, + "language_loss": 0.80956244, + "learning_rate": 3.908180695733311e-06, + "loss": 0.84050059, + "num_input_tokens_seen": 21784250, + "router_z_loss_clip": 3.67382812, + "router_z_loss_mlp": 3.73242188, + "step": 1031, + "time_per_iteration": 2.994879722595215 + }, + { + "auxiliary_loss_clip": 0.01602257, + "auxiliary_loss_mlp": 0.0149107, + "balance_loss_clip": 1.23462296, + "balance_loss_mlp": 1.11436903, + "epoch": 0.12409066313954187, + "flos": 20414428924320.0, + "grad_norm": 1.9473276821198062, + "language_loss": 0.82918173, + "learning_rate": 3.907947234225871e-06, + "loss": 0.86011499, + "num_input_tokens_seen": 21803260, + "router_z_loss_clip": 3.67773438, + "router_z_loss_mlp": 3.76367188, + "step": 1032, + "time_per_iteration": 3.011925458908081 + }, + { + "auxiliary_loss_clip": 0.0160427, + "auxiliary_loss_mlp": 0.01489907, + "balance_loss_clip": 1.2372818, + "balance_loss_mlp": 1.11549413, + "epoch": 0.12421090603018096, + "flos": 20738566258560.0, + "grad_norm": 3.0419222619354764, + "language_loss": 0.87376249, + "learning_rate": 3.907713483284352e-06, + "loss": 0.90470427, + "num_input_tokens_seen": 21822735, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 3.74414062, + "step": 1033, + "time_per_iteration": 2.988555431365967 + }, + { + "auxiliary_loss_clip": 0.01600218, + "auxiliary_loss_mlp": 0.01471934, + "balance_loss_clip": 1.23248112, + "balance_loss_mlp": 1.09504163, + "epoch": 0.12433114892082006, + "flos": 24501153111840.0, + "grad_norm": 2.7727202177193337, + "language_loss": 0.97364974, + "learning_rate": 3.907479442944216e-06, + "loss": 1.00437129, + "num_input_tokens_seen": 21841140, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 3.765625, + "step": 1034, + "time_per_iteration": 2.9711294174194336 + }, + { + "auxiliary_loss_clip": 0.01609205, + "auxiliary_loss_mlp": 0.0147988, + "balance_loss_clip": 1.24200869, + "balance_loss_mlp": 1.10775626, + "epoch": 0.12445139181145914, + "flos": 19684342486080.0, + "grad_norm": 2.323513427143275, + "language_loss": 0.92740244, + "learning_rate": 3.907245113240963e-06, + "loss": 0.95829326, + "num_input_tokens_seen": 21859260, + "router_z_loss_clip": 3.66992188, + "router_z_loss_mlp": 3.71679688, + "step": 1035, + "time_per_iteration": 3.8947432041168213 + }, + { + "auxiliary_loss_clip": 0.015998, + "auxiliary_loss_mlp": 0.01488972, + "balance_loss_clip": 1.23091531, + "balance_loss_mlp": 1.11570442, + "epoch": 0.12457163470209824, + "flos": 46426427663040.0, + "grad_norm": 2.00642944541556, + "language_loss": 0.74256027, + "learning_rate": 3.907010494210144e-06, + "loss": 0.77344805, + "num_input_tokens_seen": 21881920, + "router_z_loss_clip": 3.6875, + "router_z_loss_mlp": 3.73046875, + "step": 1036, + "time_per_iteration": 3.1906135082244873 + }, + { + "auxiliary_loss_clip": 0.01602818, + "auxiliary_loss_mlp": 0.01482193, + "balance_loss_clip": 1.23456538, + "balance_loss_mlp": 1.11350226, + "epoch": 0.12469187759273732, + "flos": 20378396808000.0, + "grad_norm": 2.1619702617712524, + "language_loss": 0.91855085, + "learning_rate": 3.9067755858873495e-06, + "loss": 0.94940096, + "num_input_tokens_seen": 21898720, + "router_z_loss_clip": 3.68164062, + "router_z_loss_mlp": 3.68554688, + "step": 1037, + "time_per_iteration": 4.690903663635254 + }, + { + "auxiliary_loss_clip": 0.01760786, + "auxiliary_loss_mlp": 0.01475327, + "balance_loss_clip": 1.4100877, + "balance_loss_mlp": 1.12284851, + "epoch": 0.12481212048337642, + "flos": 69231188767680.0, + "grad_norm": 0.9206651513997837, + "language_loss": 0.62737119, + "learning_rate": 3.906540388308214e-06, + "loss": 0.65973228, + "num_input_tokens_seen": 21958305, + "router_z_loss_clip": 3.5, + "router_z_loss_mlp": 3.53125, + "step": 1038, + "time_per_iteration": 3.4300365447998047 + }, + { + "auxiliary_loss_clip": 0.01604692, + "auxiliary_loss_mlp": 0.01497009, + "balance_loss_clip": 1.2373122, + "balance_loss_mlp": 1.13041651, + "epoch": 0.12493236337401552, + "flos": 18225686736000.0, + "grad_norm": 2.853324435758886, + "language_loss": 0.81651139, + "learning_rate": 3.906304901508417e-06, + "loss": 0.84752846, + "num_input_tokens_seen": 21977205, + "router_z_loss_clip": 3.66992188, + "router_z_loss_mlp": 3.66210938, + "step": 1039, + "time_per_iteration": 2.9339380264282227 + }, + { + "auxiliary_loss_clip": 0.01610254, + "auxiliary_loss_mlp": 0.01501381, + "balance_loss_clip": 1.24409986, + "balance_loss_mlp": 1.13688636, + "epoch": 0.12505260626465461, + "flos": 30046988187360.0, + "grad_norm": 2.253290366770086, + "language_loss": 0.75502431, + "learning_rate": 3.9060691255236835e-06, + "loss": 0.78614068, + "num_input_tokens_seen": 21997770, + "router_z_loss_clip": 3.66015625, + "router_z_loss_mlp": 3.64453125, + "step": 1040, + "time_per_iteration": 3.828115463256836 + }, + { + "auxiliary_loss_clip": 0.01606818, + "auxiliary_loss_mlp": 0.01500979, + "balance_loss_clip": 1.23981905, + "balance_loss_mlp": 1.14144397, + "epoch": 0.1251728491552937, + "flos": 24436750367520.0, + "grad_norm": 1.78698983186382, + "language_loss": 0.80600703, + "learning_rate": 3.905833060389778e-06, + "loss": 0.83708501, + "num_input_tokens_seen": 22021890, + "router_z_loss_clip": 3.66992188, + "router_z_loss_mlp": 3.59570312, + "step": 1041, + "time_per_iteration": 3.0177223682403564 + }, + { + "auxiliary_loss_clip": 0.01608655, + "auxiliary_loss_mlp": 0.01519928, + "balance_loss_clip": 1.24303412, + "balance_loss_mlp": 1.16535163, + "epoch": 0.12529309204593278, + "flos": 27121939342560.0, + "grad_norm": 2.6910109332648, + "language_loss": 0.78620291, + "learning_rate": 3.905596706142513e-06, + "loss": 0.81748879, + "num_input_tokens_seen": 22043300, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 3.54882812, + "step": 1042, + "time_per_iteration": 3.023225784301758 + }, + { + "auxiliary_loss_clip": 0.01599932, + "auxiliary_loss_mlp": 0.01501161, + "balance_loss_clip": 1.23373222, + "balance_loss_mlp": 1.14372349, + "epoch": 0.12541333493657186, + "flos": 30776809128480.0, + "grad_norm": 2.5610173374601057, + "language_loss": 0.8628453, + "learning_rate": 3.9053600628177435e-06, + "loss": 0.89385623, + "num_input_tokens_seen": 22062910, + "router_z_loss_clip": 3.66210938, + "router_z_loss_mlp": 3.57421875, + "step": 1043, + "time_per_iteration": 2.9699606895446777 + }, + { + "auxiliary_loss_clip": 0.01606621, + "auxiliary_loss_mlp": 0.01527391, + "balance_loss_clip": 1.24073136, + "balance_loss_mlp": 1.17796445, + "epoch": 0.12553357782721097, + "flos": 23661832479840.0, + "grad_norm": 2.156438596716421, + "language_loss": 0.8519218, + "learning_rate": 3.905123130451367e-06, + "loss": 0.88326192, + "num_input_tokens_seen": 22084010, + "router_z_loss_clip": 3.65429688, + "router_z_loss_mlp": 3.49609375, + "step": 1044, + "time_per_iteration": 3.0027527809143066 + }, + { + "auxiliary_loss_clip": 0.01610946, + "auxiliary_loss_mlp": 0.01541835, + "balance_loss_clip": 1.24593568, + "balance_loss_mlp": 1.1954608, + "epoch": 0.12565382071785006, + "flos": 24866101510560.0, + "grad_norm": 1.845194136066057, + "language_loss": 0.79413158, + "learning_rate": 3.904885909079326e-06, + "loss": 0.82565939, + "num_input_tokens_seen": 22102795, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 3.46679688, + "step": 1045, + "time_per_iteration": 2.984880208969116 + }, + { + "auxiliary_loss_clip": 0.01600437, + "auxiliary_loss_mlp": 0.01484694, + "balance_loss_clip": 1.23544455, + "balance_loss_mlp": 1.12821054, + "epoch": 0.12577406360848914, + "flos": 21362907893760.0, + "grad_norm": 3.129965418978413, + "language_loss": 0.77996325, + "learning_rate": 3.904648398737607e-06, + "loss": 0.81081456, + "num_input_tokens_seen": 22121360, + "router_z_loss_clip": 3.65820312, + "router_z_loss_mlp": 3.56445312, + "step": 1046, + "time_per_iteration": 3.032705307006836 + }, + { + "auxiliary_loss_clip": 0.01603749, + "auxiliary_loss_mlp": 0.01514021, + "balance_loss_clip": 1.23774624, + "balance_loss_mlp": 1.1567744, + "epoch": 0.12589430649912825, + "flos": 36141042352320.0, + "grad_norm": 2.201996460988336, + "language_loss": 0.77958983, + "learning_rate": 3.9044105994622406e-06, + "loss": 0.81076753, + "num_input_tokens_seen": 22142505, + "router_z_loss_clip": 3.6640625, + "router_z_loss_mlp": 3.5703125, + "step": 1047, + "time_per_iteration": 3.0861995220184326 + }, + { + "auxiliary_loss_clip": 0.01603925, + "auxiliary_loss_mlp": 0.01492747, + "balance_loss_clip": 1.23785865, + "balance_loss_mlp": 1.13359284, + "epoch": 0.12601454938976733, + "flos": 25340018605920.0, + "grad_norm": 2.3514214135453697, + "language_loss": 0.8200922, + "learning_rate": 3.9041725112893005e-06, + "loss": 0.85105896, + "num_input_tokens_seen": 22163730, + "router_z_loss_clip": 3.66210938, + "router_z_loss_mlp": 3.59179688, + "step": 1048, + "time_per_iteration": 3.023808002471924 + }, + { + "auxiliary_loss_clip": 0.01601512, + "auxiliary_loss_mlp": 0.01492437, + "balance_loss_clip": 1.23706722, + "balance_loss_mlp": 1.12908697, + "epoch": 0.12613479228040642, + "flos": 15561927535680.0, + "grad_norm": 1.9846846743849347, + "language_loss": 0.74963248, + "learning_rate": 3.903934134254904e-06, + "loss": 0.78057194, + "num_input_tokens_seen": 22181520, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 3.6328125, + "step": 1049, + "time_per_iteration": 3.065429449081421 + }, + { + "auxiliary_loss_clip": 0.01610517, + "auxiliary_loss_mlp": 0.01481973, + "balance_loss_clip": 1.24490309, + "balance_loss_mlp": 1.11633444, + "epoch": 0.1262550351710455, + "flos": 21472748938080.0, + "grad_norm": 2.3933284840376707, + "language_loss": 0.84976643, + "learning_rate": 3.903695468395213e-06, + "loss": 0.88069135, + "num_input_tokens_seen": 22199390, + "router_z_loss_clip": 3.65820312, + "router_z_loss_mlp": 3.65234375, + "step": 1050, + "time_per_iteration": 2.9764931201934814 + }, + { + "auxiliary_loss_clip": 0.01607206, + "auxiliary_loss_mlp": 0.01495809, + "balance_loss_clip": 1.24412072, + "balance_loss_mlp": 1.12769091, + "epoch": 0.1263752780616846, + "flos": 31579414937280.0, + "grad_norm": 2.3845814441318005, + "language_loss": 0.55547637, + "learning_rate": 3.903456513746434e-06, + "loss": 0.58650655, + "num_input_tokens_seen": 22220365, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 3.67773438, + "step": 1051, + "time_per_iteration": 3.076458215713501 + }, + { + "auxiliary_loss_clip": 0.01612866, + "auxiliary_loss_mlp": 0.014918, + "balance_loss_clip": 1.24873877, + "balance_loss_mlp": 1.12139285, + "epoch": 0.1264955209523237, + "flos": 28770806636640.0, + "grad_norm": 2.029987169022828, + "language_loss": 0.8773483, + "learning_rate": 3.903217270344815e-06, + "loss": 0.90839493, + "num_input_tokens_seen": 22240615, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 3.70117188, + "step": 1052, + "time_per_iteration": 3.1127231121063232 + }, + { + "auxiliary_loss_clip": 0.01606108, + "auxiliary_loss_mlp": 0.01475967, + "balance_loss_clip": 1.24177659, + "balance_loss_mlp": 1.0967865, + "epoch": 0.12661576384296278, + "flos": 29243661743520.0, + "grad_norm": 2.1594917002163005, + "language_loss": 0.8222338, + "learning_rate": 3.902977738226648e-06, + "loss": 0.85305452, + "num_input_tokens_seen": 22261350, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 3.78710938, + "step": 1053, + "time_per_iteration": 3.106227397918701 + }, + { + "auxiliary_loss_clip": 0.01607004, + "auxiliary_loss_mlp": 0.01480615, + "balance_loss_clip": 1.24232638, + "balance_loss_mlp": 1.11020815, + "epoch": 0.12673600673360189, + "flos": 20852465616000.0, + "grad_norm": 2.2523292371272037, + "language_loss": 0.91411901, + "learning_rate": 3.902737917428273e-06, + "loss": 0.94499516, + "num_input_tokens_seen": 22279515, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 3.703125, + "step": 1054, + "time_per_iteration": 3.191143035888672 + }, + { + "auxiliary_loss_clip": 0.01608545, + "auxiliary_loss_mlp": 0.01488619, + "balance_loss_clip": 1.24410713, + "balance_loss_mlp": 1.10753036, + "epoch": 0.12685624962424097, + "flos": 25265982108960.0, + "grad_norm": 2.4708650419460336, + "language_loss": 0.839656, + "learning_rate": 3.902497807986068e-06, + "loss": 0.87062764, + "num_input_tokens_seen": 22299535, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 3.80664062, + "step": 1055, + "time_per_iteration": 3.0397908687591553 + }, + { + "auxiliary_loss_clip": 0.0161129, + "auxiliary_loss_mlp": 0.01497214, + "balance_loss_clip": 1.24708533, + "balance_loss_mlp": 1.1174612, + "epoch": 0.12697649251488005, + "flos": 27529595213760.0, + "grad_norm": 2.6323595863610154, + "language_loss": 0.83966553, + "learning_rate": 3.902257409936458e-06, + "loss": 0.87075055, + "num_input_tokens_seen": 22320300, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 3.79492188, + "step": 1056, + "time_per_iteration": 2.961169481277466 + }, + { + "auxiliary_loss_clip": 0.01611558, + "auxiliary_loss_mlp": 0.01487027, + "balance_loss_clip": 1.24857581, + "balance_loss_mlp": 1.09621096, + "epoch": 0.12709673540551916, + "flos": 21254015053440.0, + "grad_norm": 2.184153244048057, + "language_loss": 0.84207284, + "learning_rate": 3.902016723315912e-06, + "loss": 0.87305868, + "num_input_tokens_seen": 22338240, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 3.90625, + "step": 1057, + "time_per_iteration": 3.1651792526245117 + }, + { + "auxiliary_loss_clip": 0.01612333, + "auxiliary_loss_mlp": 0.01492699, + "balance_loss_clip": 1.24680722, + "balance_loss_mlp": 1.1024555, + "epoch": 0.12721697829615825, + "flos": 25340056534080.0, + "grad_norm": 3.480991886036075, + "language_loss": 0.69693673, + "learning_rate": 3.901775748160941e-06, + "loss": 0.72798705, + "num_input_tokens_seen": 22357420, + "router_z_loss_clip": 3.65429688, + "router_z_loss_mlp": 3.8984375, + "step": 1058, + "time_per_iteration": 2.9579687118530273 + }, + { + "auxiliary_loss_clip": 0.01774929, + "auxiliary_loss_mlp": 0.01495911, + "balance_loss_clip": 1.42520666, + "balance_loss_mlp": 1.18234253, + "epoch": 0.12733722118679733, + "flos": 61950312525600.0, + "grad_norm": 0.8231886042824594, + "language_loss": 0.60818112, + "learning_rate": 3.901534484508101e-06, + "loss": 0.64088953, + "num_input_tokens_seen": 22420095, + "router_z_loss_clip": 3.5, + "router_z_loss_mlp": 3.1328125, + "step": 1059, + "time_per_iteration": 3.5888454914093018 + }, + { + "auxiliary_loss_clip": 0.01612168, + "auxiliary_loss_mlp": 0.01491627, + "balance_loss_clip": 1.24711943, + "balance_loss_mlp": 1.1082499, + "epoch": 0.1274574640774364, + "flos": 26979024578400.0, + "grad_norm": 2.147733548930302, + "language_loss": 0.7488265, + "learning_rate": 3.901292932393991e-06, + "loss": 0.77986443, + "num_input_tokens_seen": 22438975, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 3.828125, + "step": 1060, + "time_per_iteration": 3.1337532997131348 + }, + { + "auxiliary_loss_clip": 0.01612665, + "auxiliary_loss_mlp": 0.01492384, + "balance_loss_clip": 1.24774289, + "balance_loss_mlp": 1.10118735, + "epoch": 0.12757770696807552, + "flos": 22238488211040.0, + "grad_norm": 2.5610064724828923, + "language_loss": 0.85420102, + "learning_rate": 3.9010510918552555e-06, + "loss": 0.88525152, + "num_input_tokens_seen": 22458050, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 3.91210938, + "step": 1061, + "time_per_iteration": 2.992410182952881 + }, + { + "auxiliary_loss_clip": 0.0161255, + "auxiliary_loss_mlp": 0.01505849, + "balance_loss_clip": 1.24732411, + "balance_loss_mlp": 1.11331654, + "epoch": 0.1276979498587146, + "flos": 28550403912960.0, + "grad_norm": 2.6239358513476914, + "language_loss": 0.74473935, + "learning_rate": 3.900808962928581e-06, + "loss": 0.77592337, + "num_input_tokens_seen": 22475665, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 3.92382812, + "step": 1062, + "time_per_iteration": 3.0809290409088135 + }, + { + "auxiliary_loss_clip": 0.01615412, + "auxiliary_loss_mlp": 0.01488456, + "balance_loss_clip": 1.25118971, + "balance_loss_mlp": 1.09687734, + "epoch": 0.1278181927493537, + "flos": 17422246507680.0, + "grad_norm": 3.1046069749363006, + "language_loss": 0.89465427, + "learning_rate": 3.900566545650698e-06, + "loss": 0.92569292, + "num_input_tokens_seen": 22493335, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 3.9140625, + "step": 1063, + "time_per_iteration": 3.841273784637451 + }, + { + "auxiliary_loss_clip": 0.0160693, + "auxiliary_loss_mlp": 0.01496935, + "balance_loss_clip": 1.24261618, + "balance_loss_mlp": 1.10516548, + "epoch": 0.1279384356399928, + "flos": 21140570833920.0, + "grad_norm": 2.6217083246626256, + "language_loss": 0.8189829, + "learning_rate": 3.900323840058381e-06, + "loss": 0.8500216, + "num_input_tokens_seen": 22511045, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 3.91601562, + "step": 1064, + "time_per_iteration": 3.0772175788879395 + }, + { + "auxiliary_loss_clip": 0.01611895, + "auxiliary_loss_mlp": 0.01481082, + "balance_loss_clip": 1.24735081, + "balance_loss_mlp": 1.08244634, + "epoch": 0.12805867853063188, + "flos": 26579068123680.0, + "grad_norm": 9.073008382415805, + "language_loss": 0.82031167, + "learning_rate": 3.900080846188449e-06, + "loss": 0.85124135, + "num_input_tokens_seen": 22529635, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 3.98632812, + "step": 1065, + "time_per_iteration": 3.9906537532806396 + }, + { + "auxiliary_loss_clip": 0.01610376, + "auxiliary_loss_mlp": 0.01500292, + "balance_loss_clip": 1.24593866, + "balance_loss_mlp": 1.10432625, + "epoch": 0.12817892142127096, + "flos": 16438114703520.0, + "grad_norm": 1.9227709188443705, + "language_loss": 0.8148374, + "learning_rate": 3.8998375640777625e-06, + "loss": 0.84594411, + "num_input_tokens_seen": 22547505, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 3.95898438, + "step": 1066, + "time_per_iteration": 3.9365129470825195 + }, + { + "auxiliary_loss_clip": 0.01773359, + "auxiliary_loss_mlp": 0.01426697, + "balance_loss_clip": 1.42151654, + "balance_loss_mlp": 1.0848999, + "epoch": 0.12829916431191005, + "flos": 60762959818560.0, + "grad_norm": 0.724288541955995, + "language_loss": 0.5262652, + "learning_rate": 3.899593993763229e-06, + "loss": 0.55826575, + "num_input_tokens_seen": 22608465, + "router_z_loss_clip": 3.515625, + "router_z_loss_mlp": 3.421875, + "step": 1067, + "time_per_iteration": 3.4046664237976074 + }, + { + "auxiliary_loss_clip": 0.01608796, + "auxiliary_loss_mlp": 0.01485946, + "balance_loss_clip": 1.24483168, + "balance_loss_mlp": 1.10218763, + "epoch": 0.12841940720254916, + "flos": 29789377574400.0, + "grad_norm": 2.5912500503754816, + "language_loss": 0.81663728, + "learning_rate": 3.899350135281796e-06, + "loss": 0.84758466, + "num_input_tokens_seen": 22629465, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 3.83398438, + "step": 1068, + "time_per_iteration": 3.8750593662261963 + }, + { + "auxiliary_loss_clip": 0.01614081, + "auxiliary_loss_mlp": 0.0147302, + "balance_loss_clip": 1.25057542, + "balance_loss_mlp": 1.08468413, + "epoch": 0.12853965009318824, + "flos": 25953892068960.0, + "grad_norm": 2.646243026119053, + "language_loss": 0.79710317, + "learning_rate": 3.8991059886704585e-06, + "loss": 0.8279742, + "num_input_tokens_seen": 22648970, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 3.87890625, + "step": 1069, + "time_per_iteration": 3.0735459327697754 + }, + { + "auxiliary_loss_clip": 0.01616272, + "auxiliary_loss_mlp": 0.01480052, + "balance_loss_clip": 1.25218713, + "balance_loss_mlp": 1.09114385, + "epoch": 0.12865989298382732, + "flos": 30849518139840.0, + "grad_norm": 2.445052197003249, + "language_loss": 0.83154505, + "learning_rate": 3.898861553966252e-06, + "loss": 0.8625083, + "num_input_tokens_seen": 22668620, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 3.88476562, + "step": 1070, + "time_per_iteration": 3.0178916454315186 + }, + { + "auxiliary_loss_clip": 0.01611787, + "auxiliary_loss_mlp": 0.0148896, + "balance_loss_clip": 1.2500968, + "balance_loss_mlp": 1.11283112, + "epoch": 0.12878013587446643, + "flos": 25888199767200.0, + "grad_norm": 1.8975573748620362, + "language_loss": 0.88247311, + "learning_rate": 3.898616831206257e-06, + "loss": 0.91348052, + "num_input_tokens_seen": 22689045, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 3.75976562, + "step": 1071, + "time_per_iteration": 3.004049777984619 + }, + { + "auxiliary_loss_clip": 0.01602782, + "auxiliary_loss_mlp": 0.01478281, + "balance_loss_clip": 1.23904335, + "balance_loss_mlp": 1.10348701, + "epoch": 0.12890037876510552, + "flos": 23335495312320.0, + "grad_norm": 2.3869612593079546, + "language_loss": 0.77594692, + "learning_rate": 3.8983718204276e-06, + "loss": 0.80675751, + "num_input_tokens_seen": 22711265, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 3.74414062, + "step": 1072, + "time_per_iteration": 3.076990842819214 + }, + { + "auxiliary_loss_clip": 0.01611497, + "auxiliary_loss_mlp": 0.01513231, + "balance_loss_clip": 1.24817228, + "balance_loss_mlp": 1.13748348, + "epoch": 0.1290206216557446, + "flos": 23589540678240.0, + "grad_norm": 2.017275088037765, + "language_loss": 0.82686186, + "learning_rate": 3.898126521667446e-06, + "loss": 0.85810912, + "num_input_tokens_seen": 22731420, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 3.75390625, + "step": 1073, + "time_per_iteration": 2.959205389022827 + }, + { + "auxiliary_loss_clip": 0.01616418, + "auxiliary_loss_mlp": 0.0148004, + "balance_loss_clip": 1.25277901, + "balance_loss_mlp": 1.10448313, + "epoch": 0.12914086454638368, + "flos": 24172919536320.0, + "grad_norm": 1.6799225867730134, + "language_loss": 0.83399135, + "learning_rate": 3.897880934963007e-06, + "loss": 0.8649559, + "num_input_tokens_seen": 22750970, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 3.75195312, + "step": 1074, + "time_per_iteration": 3.0758063793182373 + }, + { + "auxiliary_loss_clip": 0.01603743, + "auxiliary_loss_mlp": 0.01489118, + "balance_loss_clip": 1.24049211, + "balance_loss_mlp": 1.12500572, + "epoch": 0.1292611074370228, + "flos": 20269276398720.0, + "grad_norm": 2.3926522542133366, + "language_loss": 0.78657329, + "learning_rate": 3.89763506035154e-06, + "loss": 0.8175019, + "num_input_tokens_seen": 22768820, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 3.63867188, + "step": 1075, + "time_per_iteration": 3.0632541179656982 + }, + { + "auxiliary_loss_clip": 0.01608222, + "auxiliary_loss_mlp": 0.01492458, + "balance_loss_clip": 1.24581099, + "balance_loss_mlp": 1.13139701, + "epoch": 0.12938135032766188, + "flos": 27379967165280.0, + "grad_norm": 2.549361331306066, + "language_loss": 0.81619585, + "learning_rate": 3.897388897870343e-06, + "loss": 0.84720266, + "num_input_tokens_seen": 22789460, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 3.61132812, + "step": 1076, + "time_per_iteration": 3.0736520290374756 + }, + { + "auxiliary_loss_clip": 0.016058, + "auxiliary_loss_mlp": 0.0149655, + "balance_loss_clip": 1.24283171, + "balance_loss_mlp": 1.13434446, + "epoch": 0.12950159321830096, + "flos": 29279618003520.0, + "grad_norm": 2.4126399406383463, + "language_loss": 0.74739635, + "learning_rate": 3.89714244755676e-06, + "loss": 0.77841985, + "num_input_tokens_seen": 22810820, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 3.62109375, + "step": 1077, + "time_per_iteration": 3.066366672515869 + }, + { + "auxiliary_loss_clip": 0.01599164, + "auxiliary_loss_mlp": 0.01501624, + "balance_loss_clip": 1.2365526, + "balance_loss_mlp": 1.1403718, + "epoch": 0.12962183610894007, + "flos": 24537185228160.0, + "grad_norm": 3.1788709836825504, + "language_loss": 0.86725438, + "learning_rate": 3.896895709448175e-06, + "loss": 0.8982622, + "num_input_tokens_seen": 22830570, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 3.61132812, + "step": 1078, + "time_per_iteration": 2.9884729385375977 + }, + { + "auxiliary_loss_clip": 0.01602818, + "auxiliary_loss_mlp": 0.01488167, + "balance_loss_clip": 1.24032593, + "balance_loss_mlp": 1.12119269, + "epoch": 0.12974207899957915, + "flos": 11217403094400.0, + "grad_norm": 2.7876683434753153, + "language_loss": 0.77248001, + "learning_rate": 3.896648683582019e-06, + "loss": 0.80338991, + "num_input_tokens_seen": 22845905, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 3.6640625, + "step": 1079, + "time_per_iteration": 2.9974637031555176 + }, + { + "auxiliary_loss_clip": 0.01616625, + "auxiliary_loss_mlp": 0.01499901, + "balance_loss_clip": 1.25286913, + "balance_loss_mlp": 1.14246392, + "epoch": 0.12986232189021824, + "flos": 24720228349920.0, + "grad_norm": 2.347479639944538, + "language_loss": 0.7992208, + "learning_rate": 3.896401369995766e-06, + "loss": 0.83038604, + "num_input_tokens_seen": 22865710, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 3.57421875, + "step": 1080, + "time_per_iteration": 3.024266004562378 + }, + { + "auxiliary_loss_clip": 0.01608202, + "auxiliary_loss_mlp": 0.01511812, + "balance_loss_clip": 1.24531448, + "balance_loss_mlp": 1.15475678, + "epoch": 0.12998256478085732, + "flos": 23917660469280.0, + "grad_norm": 1.8494340261080087, + "language_loss": 0.79677224, + "learning_rate": 3.896153768726932e-06, + "loss": 0.82797241, + "num_input_tokens_seen": 22886020, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 3.57226562, + "step": 1081, + "time_per_iteration": 2.972041368484497 + }, + { + "auxiliary_loss_clip": 0.01610035, + "auxiliary_loss_mlp": 0.0149687, + "balance_loss_clip": 1.24629676, + "balance_loss_mlp": 1.1277976, + "epoch": 0.13010280767149643, + "flos": 18626174184960.0, + "grad_norm": 2.7839858963898165, + "language_loss": 0.8829627, + "learning_rate": 3.8959058798130806e-06, + "loss": 0.91403174, + "num_input_tokens_seen": 22903995, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 3.6875, + "step": 1082, + "time_per_iteration": 3.051098108291626 + }, + { + "auxiliary_loss_clip": 0.0160535, + "auxiliary_loss_mlp": 0.01491484, + "balance_loss_clip": 1.24071419, + "balance_loss_mlp": 1.12889743, + "epoch": 0.1302230505621355, + "flos": 22786214234400.0, + "grad_norm": 2.532845324212772, + "language_loss": 0.74994588, + "learning_rate": 3.895657703291814e-06, + "loss": 0.78091419, + "num_input_tokens_seen": 22924100, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 3.62304688, + "step": 1083, + "time_per_iteration": 2.9972403049468994 + }, + { + "auxiliary_loss_clip": 0.01609784, + "auxiliary_loss_mlp": 0.01484216, + "balance_loss_clip": 1.2442975, + "balance_loss_mlp": 1.11895871, + "epoch": 0.1303432934527746, + "flos": 21325700004480.0, + "grad_norm": 4.231225340029963, + "language_loss": 0.80341423, + "learning_rate": 3.895409239200781e-06, + "loss": 0.83435428, + "num_input_tokens_seen": 22939985, + "router_z_loss_clip": 3.65039062, + "router_z_loss_mlp": 3.65039062, + "step": 1084, + "time_per_iteration": 2.9799249172210693 + }, + { + "auxiliary_loss_clip": 0.01609074, + "auxiliary_loss_mlp": 0.01504005, + "balance_loss_clip": 1.24442029, + "balance_loss_mlp": 1.14122748, + "epoch": 0.1304635363434137, + "flos": 20924567776800.0, + "grad_norm": 2.6908377174067226, + "language_loss": 0.91242743, + "learning_rate": 3.895160487577673e-06, + "loss": 0.94355822, + "num_input_tokens_seen": 22957555, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 3.62304688, + "step": 1085, + "time_per_iteration": 3.0591320991516113 + }, + { + "auxiliary_loss_clip": 0.01805881, + "auxiliary_loss_mlp": 0.01451977, + "balance_loss_clip": 1.45163643, + "balance_loss_mlp": 1.14603806, + "epoch": 0.1305837792340528, + "flos": 63252133876800.0, + "grad_norm": 0.8376908941922292, + "language_loss": 0.60876274, + "learning_rate": 3.894911448460226e-06, + "loss": 0.64134133, + "num_input_tokens_seen": 23016870, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 3.0546875, + "step": 1086, + "time_per_iteration": 3.4299237728118896 + }, + { + "auxiliary_loss_clip": 0.01597909, + "auxiliary_loss_mlp": 0.01486145, + "balance_loss_clip": 1.23393774, + "balance_loss_mlp": 1.10829902, + "epoch": 0.13070402212469187, + "flos": 26431070986080.0, + "grad_norm": 1.9938247454121794, + "language_loss": 0.72442997, + "learning_rate": 3.8946621218862195e-06, + "loss": 0.75527048, + "num_input_tokens_seen": 23037870, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 3.77539062, + "step": 1087, + "time_per_iteration": 3.150756597518921 + }, + { + "auxiliary_loss_clip": 0.01606101, + "auxiliary_loss_mlp": 0.01472817, + "balance_loss_clip": 1.24185765, + "balance_loss_mlp": 1.09401751, + "epoch": 0.13082426501533098, + "flos": 27675923512320.0, + "grad_norm": 2.885785010845002, + "language_loss": 0.89007568, + "learning_rate": 3.894412507893475e-06, + "loss": 0.92086482, + "num_input_tokens_seen": 23058150, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 3.78515625, + "step": 1088, + "time_per_iteration": 3.1038014888763428 + }, + { + "auxiliary_loss_clip": 0.01611022, + "auxiliary_loss_mlp": 0.01485655, + "balance_loss_clip": 1.24576783, + "balance_loss_mlp": 1.10761833, + "epoch": 0.13094450790597006, + "flos": 24829083262080.0, + "grad_norm": 3.2670883150729733, + "language_loss": 0.72076356, + "learning_rate": 3.894162606519859e-06, + "loss": 0.75173026, + "num_input_tokens_seen": 23077100, + "router_z_loss_clip": 3.66015625, + "router_z_loss_mlp": 3.77929688, + "step": 1089, + "time_per_iteration": 3.0584182739257812 + }, + { + "auxiliary_loss_clip": 0.01605136, + "auxiliary_loss_mlp": 0.01479185, + "balance_loss_clip": 1.24110055, + "balance_loss_mlp": 1.09561729, + "epoch": 0.13106475079660915, + "flos": 19064628086400.0, + "grad_norm": 2.005000422021068, + "language_loss": 0.77177691, + "learning_rate": 3.893912417803282e-06, + "loss": 0.80262011, + "num_input_tokens_seen": 23096815, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 3.83398438, + "step": 1090, + "time_per_iteration": 3.8205697536468506 + }, + { + "auxiliary_loss_clip": 0.01598584, + "auxiliary_loss_mlp": 0.01474275, + "balance_loss_clip": 1.23435426, + "balance_loss_mlp": 1.08498538, + "epoch": 0.13118499368724823, + "flos": 28915731593280.0, + "grad_norm": 2.726135776860422, + "language_loss": 0.77215302, + "learning_rate": 3.8936619417816975e-06, + "loss": 0.80288166, + "num_input_tokens_seen": 23117145, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 3.890625, + "step": 1091, + "time_per_iteration": 3.105851888656616 + }, + { + "auxiliary_loss_clip": 0.01603683, + "auxiliary_loss_mlp": 0.01484453, + "balance_loss_clip": 1.24016023, + "balance_loss_mlp": 1.09649873, + "epoch": 0.13130523657788734, + "flos": 14285177062560.0, + "grad_norm": 2.375938529567851, + "language_loss": 0.71536428, + "learning_rate": 3.8934111784931015e-06, + "loss": 0.74624562, + "num_input_tokens_seen": 23134595, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 3.87695312, + "step": 1092, + "time_per_iteration": 2.983687400817871 + }, + { + "auxiliary_loss_clip": 0.01793502, + "auxiliary_loss_mlp": 0.01440063, + "balance_loss_clip": 1.43804252, + "balance_loss_mlp": 1.09750366, + "epoch": 0.13142547946852642, + "flos": 70180805581920.0, + "grad_norm": 0.9201628255856238, + "language_loss": 0.59073079, + "learning_rate": 3.893160127975535e-06, + "loss": 0.62306643, + "num_input_tokens_seen": 23195285, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 3.4296875, + "step": 1093, + "time_per_iteration": 4.477119207382202 + }, + { + "auxiliary_loss_clip": 0.01605347, + "auxiliary_loss_mlp": 0.01478835, + "balance_loss_clip": 1.24082112, + "balance_loss_mlp": 1.10289621, + "epoch": 0.1315457223591655, + "flos": 45809482019040.0, + "grad_norm": 2.7913308229629368, + "language_loss": 0.81344527, + "learning_rate": 3.8929087902670826e-06, + "loss": 0.8442871, + "num_input_tokens_seen": 23216915, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 3.75390625, + "step": 1094, + "time_per_iteration": 3.168236255645752 + }, + { + "auxiliary_loss_clip": 0.01786336, + "auxiliary_loss_mlp": 0.01435371, + "balance_loss_clip": 1.42901218, + "balance_loss_mlp": 1.10196686, + "epoch": 0.13166596524980462, + "flos": 62887640616000.0, + "grad_norm": 0.9221255512628848, + "language_loss": 0.60630751, + "learning_rate": 3.8926571654058715e-06, + "loss": 0.63852453, + "num_input_tokens_seen": 23273560, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 3.3359375, + "step": 1095, + "time_per_iteration": 4.246360540390015 + }, + { + "auxiliary_loss_clip": 0.01595497, + "auxiliary_loss_mlp": 0.01475689, + "balance_loss_clip": 1.22872436, + "balance_loss_mlp": 1.09421968, + "epoch": 0.1317862081404437, + "flos": 23588782115040.0, + "grad_norm": 3.5184071276263102, + "language_loss": 0.77147555, + "learning_rate": 3.892405253430074e-06, + "loss": 0.80218744, + "num_input_tokens_seen": 23291080, + "router_z_loss_clip": 3.66992188, + "router_z_loss_mlp": 3.81054688, + "step": 1096, + "time_per_iteration": 2.94706130027771 + }, + { + "auxiliary_loss_clip": 0.0159681, + "auxiliary_loss_mlp": 0.01476776, + "balance_loss_clip": 1.2302146, + "balance_loss_mlp": 1.10236359, + "epoch": 0.13190645103108278, + "flos": 20262525186240.0, + "grad_norm": 1.9817606961814538, + "language_loss": 0.8231889, + "learning_rate": 3.892153054377904e-06, + "loss": 0.85392475, + "num_input_tokens_seen": 23308485, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 3.74023438, + "step": 1097, + "time_per_iteration": 2.9723525047302246 + }, + { + "auxiliary_loss_clip": 0.01769935, + "auxiliary_loss_mlp": 0.01430237, + "balance_loss_clip": 1.41156554, + "balance_loss_mlp": 1.11437988, + "epoch": 0.13202669392172187, + "flos": 53460692094240.0, + "grad_norm": 0.9465408936108102, + "language_loss": 0.5942331, + "learning_rate": 3.891900568287619e-06, + "loss": 0.62623477, + "num_input_tokens_seen": 23360870, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 3.15625, + "step": 1098, + "time_per_iteration": 3.2177562713623047 + }, + { + "auxiliary_loss_clip": 0.01590704, + "auxiliary_loss_mlp": 0.01493743, + "balance_loss_clip": 1.22273099, + "balance_loss_mlp": 1.1260066, + "epoch": 0.13214693681236098, + "flos": 15853104934560.0, + "grad_norm": 2.482970226668715, + "language_loss": 0.72326756, + "learning_rate": 3.891647795197523e-06, + "loss": 0.75411201, + "num_input_tokens_seen": 23376910, + "router_z_loss_clip": 3.6796875, + "router_z_loss_mlp": 3.67382812, + "step": 1099, + "time_per_iteration": 3.115893840789795 + }, + { + "auxiliary_loss_clip": 0.0159014, + "auxiliary_loss_mlp": 0.01481502, + "balance_loss_clip": 1.22363579, + "balance_loss_mlp": 1.11071301, + "epoch": 0.13226717970300006, + "flos": 19355843413440.0, + "grad_norm": 4.457582096680412, + "language_loss": 0.6888181, + "learning_rate": 3.8913947351459605e-06, + "loss": 0.71953452, + "num_input_tokens_seen": 23394450, + "router_z_loss_clip": 3.6640625, + "router_z_loss_mlp": 3.703125, + "step": 1100, + "time_per_iteration": 3.096052408218384 + }, + { + "auxiliary_loss_clip": 0.01585298, + "auxiliary_loss_mlp": 0.01484003, + "balance_loss_clip": 1.21814394, + "balance_loss_mlp": 1.11760163, + "epoch": 0.13238742259363914, + "flos": 20699841242880.0, + "grad_norm": 2.08355281487507, + "language_loss": 0.6755957, + "learning_rate": 3.89114138817132e-06, + "loss": 0.70628864, + "num_input_tokens_seen": 23411115, + "router_z_loss_clip": 3.67382812, + "router_z_loss_mlp": 3.66210938, + "step": 1101, + "time_per_iteration": 3.041337251663208 + }, + { + "auxiliary_loss_clip": 0.01592666, + "auxiliary_loss_mlp": 0.01487014, + "balance_loss_clip": 1.22686493, + "balance_loss_mlp": 1.12251973, + "epoch": 0.13250766548427825, + "flos": 21034560533760.0, + "grad_norm": 2.040440135688504, + "language_loss": 0.84224033, + "learning_rate": 3.890887754312035e-06, + "loss": 0.8730371, + "num_input_tokens_seen": 23429360, + "router_z_loss_clip": 3.65820312, + "router_z_loss_mlp": 3.64257812, + "step": 1102, + "time_per_iteration": 3.011591911315918 + }, + { + "auxiliary_loss_clip": 0.0158238, + "auxiliary_loss_mlp": 0.01471523, + "balance_loss_clip": 1.21603894, + "balance_loss_mlp": 1.09615636, + "epoch": 0.13262790837491734, + "flos": 22640075576640.0, + "grad_norm": 2.3314145694840698, + "language_loss": 0.87693578, + "learning_rate": 3.890633833606581e-06, + "loss": 0.90747482, + "num_input_tokens_seen": 23449050, + "router_z_loss_clip": 3.66015625, + "router_z_loss_mlp": 3.75, + "step": 1103, + "time_per_iteration": 2.970207691192627 + }, + { + "auxiliary_loss_clip": 0.01599469, + "auxiliary_loss_mlp": 0.01483009, + "balance_loss_clip": 1.23158431, + "balance_loss_mlp": 1.12156606, + "epoch": 0.13274815126555642, + "flos": 19685214833760.0, + "grad_norm": 2.2345554306245043, + "language_loss": 0.69658542, + "learning_rate": 3.890379626093477e-06, + "loss": 0.7274102, + "num_input_tokens_seen": 23468800, + "router_z_loss_clip": 3.67773438, + "router_z_loss_mlp": 3.61523438, + "step": 1104, + "time_per_iteration": 2.9994122982025146 + }, + { + "auxiliary_loss_clip": 0.01577852, + "auxiliary_loss_mlp": 0.01471866, + "balance_loss_clip": 1.21023905, + "balance_loss_mlp": 1.09611797, + "epoch": 0.1328683941561955, + "flos": 21319441858080.0, + "grad_norm": 3.2053205734039754, + "language_loss": 0.92649734, + "learning_rate": 3.890125131811287e-06, + "loss": 0.95699453, + "num_input_tokens_seen": 23486850, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 3.75585938, + "step": 1105, + "time_per_iteration": 3.023709774017334 + }, + { + "auxiliary_loss_clip": 0.01589182, + "auxiliary_loss_mlp": 0.01483326, + "balance_loss_clip": 1.22053719, + "balance_loss_mlp": 1.11215639, + "epoch": 0.1329886370468346, + "flos": 13700850000480.0, + "grad_norm": 4.244743929651234, + "language_loss": 0.75386536, + "learning_rate": 3.889870350798618e-06, + "loss": 0.78459048, + "num_input_tokens_seen": 23504195, + "router_z_loss_clip": 3.6875, + "router_z_loss_mlp": 3.70898438, + "step": 1106, + "time_per_iteration": 2.9493846893310547 + }, + { + "auxiliary_loss_clip": 0.01575535, + "auxiliary_loss_mlp": 0.01488428, + "balance_loss_clip": 1.2073921, + "balance_loss_mlp": 1.12202692, + "epoch": 0.1331088799374737, + "flos": 21034522605600.0, + "grad_norm": 1.662873656076564, + "language_loss": 0.78769964, + "learning_rate": 3.889615283094119e-06, + "loss": 0.81833923, + "num_input_tokens_seen": 23523385, + "router_z_loss_clip": 3.67773438, + "router_z_loss_mlp": 3.66210938, + "step": 1107, + "time_per_iteration": 3.0274245738983154 + }, + { + "auxiliary_loss_clip": 0.01574991, + "auxiliary_loss_mlp": 0.01489912, + "balance_loss_clip": 1.20685816, + "balance_loss_mlp": 1.11721611, + "epoch": 0.13322912282811278, + "flos": 18262439487360.0, + "grad_norm": 2.887030863135352, + "language_loss": 0.85164678, + "learning_rate": 3.889359928736485e-06, + "loss": 0.88229579, + "num_input_tokens_seen": 23541330, + "router_z_loss_clip": 3.68164062, + "router_z_loss_mlp": 3.72460938, + "step": 1108, + "time_per_iteration": 2.9099953174591064 + }, + { + "auxiliary_loss_clip": 0.01577456, + "auxiliary_loss_mlp": 0.01482996, + "balance_loss_clip": 1.2092104, + "balance_loss_mlp": 1.10915589, + "epoch": 0.1333493657187519, + "flos": 24463110803040.0, + "grad_norm": 2.246090731062931, + "language_loss": 0.91491902, + "learning_rate": 3.889104287764451e-06, + "loss": 0.9455235, + "num_input_tokens_seen": 23561705, + "router_z_loss_clip": 3.68359375, + "router_z_loss_mlp": 3.734375, + "step": 1109, + "time_per_iteration": 3.056213855743408 + }, + { + "auxiliary_loss_clip": 0.01577257, + "auxiliary_loss_mlp": 0.01468491, + "balance_loss_clip": 1.20881009, + "balance_loss_mlp": 1.09522247, + "epoch": 0.13346960860939097, + "flos": 22160848538880.0, + "grad_norm": 2.3360431598486886, + "language_loss": 0.9049114, + "learning_rate": 3.888848360216798e-06, + "loss": 0.9353689, + "num_input_tokens_seen": 23579350, + "router_z_loss_clip": 3.68554688, + "router_z_loss_mlp": 3.72851562, + "step": 1110, + "time_per_iteration": 3.084862470626831 + }, + { + "auxiliary_loss_clip": 0.01691791, + "auxiliary_loss_mlp": 0.01507301, + "balance_loss_clip": 1.32957828, + "balance_loss_mlp": 1.21128082, + "epoch": 0.13358985150003005, + "flos": 67938811528320.0, + "grad_norm": 0.8566522013067104, + "language_loss": 0.56545377, + "learning_rate": 3.888592146132351e-06, + "loss": 0.59744465, + "num_input_tokens_seen": 23640620, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.953125, + "step": 1111, + "time_per_iteration": 3.627044677734375 + }, + { + "auxiliary_loss_clip": 0.01577717, + "auxiliary_loss_mlp": 0.01481015, + "balance_loss_clip": 1.20942879, + "balance_loss_mlp": 1.10889173, + "epoch": 0.13371009439066917, + "flos": 26836754592960.0, + "grad_norm": 1.9009324768573537, + "language_loss": 0.78566277, + "learning_rate": 3.888335645549978e-06, + "loss": 0.81625009, + "num_input_tokens_seen": 23661040, + "router_z_loss_clip": 3.68554688, + "router_z_loss_mlp": 3.71875, + "step": 1112, + "time_per_iteration": 2.990203619003296 + }, + { + "auxiliary_loss_clip": 0.01579706, + "auxiliary_loss_mlp": 0.01475081, + "balance_loss_clip": 1.21121025, + "balance_loss_mlp": 1.09914231, + "epoch": 0.13383033728130825, + "flos": 26325250326720.0, + "grad_norm": 3.102293895063116, + "language_loss": 0.82112801, + "learning_rate": 3.888078858508588e-06, + "loss": 0.85167587, + "num_input_tokens_seen": 23680900, + "router_z_loss_clip": 3.68359375, + "router_z_loss_mlp": 3.75585938, + "step": 1113, + "time_per_iteration": 2.978856086730957 + }, + { + "auxiliary_loss_clip": 0.01580549, + "auxiliary_loss_mlp": 0.01471418, + "balance_loss_clip": 1.21247673, + "balance_loss_mlp": 1.08899498, + "epoch": 0.13395058017194733, + "flos": 22566077007840.0, + "grad_norm": 2.2215038343179794, + "language_loss": 0.84396744, + "learning_rate": 3.8878217850471365e-06, + "loss": 0.87448716, + "num_input_tokens_seen": 23700815, + "router_z_loss_clip": 3.68164062, + "router_z_loss_mlp": 3.82226562, + "step": 1114, + "time_per_iteration": 2.935671091079712 + }, + { + "auxiliary_loss_clip": 0.01576025, + "auxiliary_loss_mlp": 0.01472613, + "balance_loss_clip": 1.20748293, + "balance_loss_mlp": 1.08923626, + "epoch": 0.13407082306258641, + "flos": 25813252994400.0, + "grad_norm": 2.2713132648830405, + "language_loss": 0.73857141, + "learning_rate": 3.887564425204621e-06, + "loss": 0.76905775, + "num_input_tokens_seen": 23722500, + "router_z_loss_clip": 3.68359375, + "router_z_loss_mlp": 3.83007812, + "step": 1115, + "time_per_iteration": 2.9833836555480957 + }, + { + "auxiliary_loss_clip": 0.01685198, + "auxiliary_loss_mlp": 0.01472672, + "balance_loss_clip": 1.32240653, + "balance_loss_mlp": 1.13316345, + "epoch": 0.13419106595322552, + "flos": 68344267930560.0, + "grad_norm": 0.9236219936057884, + "language_loss": 0.54973543, + "learning_rate": 3.887306779020083e-06, + "loss": 0.58131409, + "num_input_tokens_seen": 23777155, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 3.3984375, + "step": 1116, + "time_per_iteration": 3.377739191055298 + }, + { + "auxiliary_loss_clip": 0.01577069, + "auxiliary_loss_mlp": 0.01489734, + "balance_loss_clip": 1.20995045, + "balance_loss_mlp": 1.10807383, + "epoch": 0.1343113088438646, + "flos": 20451181675680.0, + "grad_norm": 2.4249592546820575, + "language_loss": 0.70732093, + "learning_rate": 3.887048846532608e-06, + "loss": 0.73798901, + "num_input_tokens_seen": 23794130, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 3.81445312, + "step": 1117, + "time_per_iteration": 3.739161252975464 + }, + { + "auxiliary_loss_clip": 0.01685316, + "auxiliary_loss_mlp": 0.01430267, + "balance_loss_clip": 1.32291961, + "balance_loss_mlp": 1.08999634, + "epoch": 0.1344315517345037, + "flos": 67395637248480.0, + "grad_norm": 0.7765361104595174, + "language_loss": 0.58072698, + "learning_rate": 3.8867906277813224e-06, + "loss": 0.61188275, + "num_input_tokens_seen": 23852285, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 3.40625, + "step": 1118, + "time_per_iteration": 3.2349140644073486 + }, + { + "auxiliary_loss_clip": 0.01582678, + "auxiliary_loss_mlp": 0.01477048, + "balance_loss_clip": 1.21314692, + "balance_loss_mlp": 1.1003468, + "epoch": 0.1345517946251428, + "flos": 40737639895200.0, + "grad_norm": 2.4816951854731775, + "language_loss": 0.73930931, + "learning_rate": 3.886532122805399e-06, + "loss": 0.76990664, + "num_input_tokens_seen": 23874765, + "router_z_loss_clip": 3.69335938, + "router_z_loss_mlp": 3.76367188, + "step": 1119, + "time_per_iteration": 3.0588483810424805 + }, + { + "auxiliary_loss_clip": 0.01580115, + "auxiliary_loss_mlp": 0.01484481, + "balance_loss_clip": 1.21320152, + "balance_loss_mlp": 1.10129476, + "epoch": 0.13467203751578188, + "flos": 22818870744480.0, + "grad_norm": 3.0333610146697896, + "language_loss": 0.89927948, + "learning_rate": 3.886273331644053e-06, + "loss": 0.92992544, + "num_input_tokens_seen": 23893635, + "router_z_loss_clip": 3.66992188, + "router_z_loss_mlp": 3.83007812, + "step": 1120, + "time_per_iteration": 3.8322718143463135 + }, + { + "auxiliary_loss_clip": 0.01576327, + "auxiliary_loss_mlp": 0.01488151, + "balance_loss_clip": 1.20816648, + "balance_loss_mlp": 1.10248566, + "epoch": 0.13479228040642097, + "flos": 17093444009760.0, + "grad_norm": 2.698271017503519, + "language_loss": 0.82658863, + "learning_rate": 3.886014254336542e-06, + "loss": 0.85723346, + "num_input_tokens_seen": 23910110, + "router_z_loss_clip": 3.67773438, + "router_z_loss_mlp": 3.85546875, + "step": 1121, + "time_per_iteration": 3.0322890281677246 + }, + { + "auxiliary_loss_clip": 0.01580923, + "auxiliary_loss_mlp": 0.01486968, + "balance_loss_clip": 1.2130481, + "balance_loss_mlp": 1.1047349, + "epoch": 0.13491252329706005, + "flos": 23732683011360.0, + "grad_norm": 2.6110617019340396, + "language_loss": 0.92668951, + "learning_rate": 3.885754890922168e-06, + "loss": 0.95736843, + "num_input_tokens_seen": 23930440, + "router_z_loss_clip": 3.67773438, + "router_z_loss_mlp": 3.81835938, + "step": 1122, + "time_per_iteration": 2.9244134426116943 + }, + { + "auxiliary_loss_clip": 0.01580986, + "auxiliary_loss_mlp": 0.0148481, + "balance_loss_clip": 1.21254325, + "balance_loss_mlp": 1.10543787, + "epoch": 0.13503276618769916, + "flos": 34129691989920.0, + "grad_norm": 1.9430068546134436, + "language_loss": 0.78817523, + "learning_rate": 3.885495241440277e-06, + "loss": 0.81883311, + "num_input_tokens_seen": 23954535, + "router_z_loss_clip": 3.68359375, + "router_z_loss_mlp": 3.7890625, + "step": 1123, + "time_per_iteration": 3.865621328353882 + }, + { + "auxiliary_loss_clip": 0.01581088, + "auxiliary_loss_mlp": 0.01465289, + "balance_loss_clip": 1.21269727, + "balance_loss_mlp": 1.09221137, + "epoch": 0.13515300907833824, + "flos": 17714258326080.0, + "grad_norm": 2.090010790636617, + "language_loss": 0.74090141, + "learning_rate": 3.885235305930257e-06, + "loss": 0.77136517, + "num_input_tokens_seen": 23972735, + "router_z_loss_clip": 3.68164062, + "router_z_loss_mlp": 3.7265625, + "step": 1124, + "time_per_iteration": 3.0560312271118164 + }, + { + "auxiliary_loss_clip": 0.01591882, + "auxiliary_loss_mlp": 0.01481245, + "balance_loss_clip": 1.22442436, + "balance_loss_mlp": 1.11141014, + "epoch": 0.13527325196897733, + "flos": 20262638970720.0, + "grad_norm": 1.9517067758979123, + "language_loss": 0.85224617, + "learning_rate": 3.884975084431539e-06, + "loss": 0.88297749, + "num_input_tokens_seen": 23987685, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 3.69335938, + "step": 1125, + "time_per_iteration": 2.8960039615631104 + }, + { + "auxiliary_loss_clip": 0.01584331, + "auxiliary_loss_mlp": 0.01494932, + "balance_loss_clip": 1.21698618, + "balance_loss_mlp": 1.11823082, + "epoch": 0.13539349485961644, + "flos": 18188440918560.0, + "grad_norm": 7.270590466062873, + "language_loss": 0.91939962, + "learning_rate": 3.8847145769836e-06, + "loss": 0.95019227, + "num_input_tokens_seen": 24004105, + "router_z_loss_clip": 3.67382812, + "router_z_loss_mlp": 3.76171875, + "step": 1126, + "time_per_iteration": 2.990835428237915 + }, + { + "auxiliary_loss_clip": 0.01580898, + "auxiliary_loss_mlp": 0.01510855, + "balance_loss_clip": 1.21233845, + "balance_loss_mlp": 1.14807677, + "epoch": 0.13551373775025552, + "flos": 19319469943680.0, + "grad_norm": 3.4691167721520384, + "language_loss": 0.66370177, + "learning_rate": 3.884453783625959e-06, + "loss": 0.6946193, + "num_input_tokens_seen": 24021715, + "router_z_loss_clip": 3.68164062, + "router_z_loss_mlp": 3.62695312, + "step": 1127, + "time_per_iteration": 2.934425115585327 + }, + { + "auxiliary_loss_clip": 0.01584437, + "auxiliary_loss_mlp": 0.01488747, + "balance_loss_clip": 1.2153399, + "balance_loss_mlp": 1.11910272, + "epoch": 0.1356339806408946, + "flos": 20852958682080.0, + "grad_norm": 2.575144149545281, + "language_loss": 0.8503468, + "learning_rate": 3.884192704398176e-06, + "loss": 0.88107872, + "num_input_tokens_seen": 24038915, + "router_z_loss_clip": 3.68945312, + "router_z_loss_mlp": 3.69140625, + "step": 1128, + "time_per_iteration": 3.067682981491089 + }, + { + "auxiliary_loss_clip": 0.01573877, + "auxiliary_loss_mlp": 0.01494272, + "balance_loss_clip": 1.20621419, + "balance_loss_mlp": 1.12901437, + "epoch": 0.13575422353153369, + "flos": 50479774705440.0, + "grad_norm": 2.1904910258750596, + "language_loss": 0.74411219, + "learning_rate": 3.883931339339858e-06, + "loss": 0.77479362, + "num_input_tokens_seen": 24063300, + "router_z_loss_clip": 3.6796875, + "router_z_loss_mlp": 3.65234375, + "step": 1129, + "time_per_iteration": 3.2328426837921143 + }, + { + "auxiliary_loss_clip": 0.01574756, + "auxiliary_loss_mlp": 0.01509035, + "balance_loss_clip": 1.20767581, + "balance_loss_mlp": 1.14625692, + "epoch": 0.1358744664221728, + "flos": 18152788083840.0, + "grad_norm": 2.0467399489702776, + "language_loss": 0.79217833, + "learning_rate": 3.883669688490654e-06, + "loss": 0.82301623, + "num_input_tokens_seen": 24081070, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 3.625, + "step": 1130, + "time_per_iteration": 2.9235284328460693 + }, + { + "auxiliary_loss_clip": 0.01578267, + "auxiliary_loss_mlp": 0.01485199, + "balance_loss_clip": 1.21124673, + "balance_loss_mlp": 1.11421943, + "epoch": 0.13599470931281188, + "flos": 18444989543040.0, + "grad_norm": 4.193098758734294, + "language_loss": 0.85782129, + "learning_rate": 3.883407751890256e-06, + "loss": 0.88845599, + "num_input_tokens_seen": 24099675, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 3.70703125, + "step": 1131, + "time_per_iteration": 2.98069429397583 + }, + { + "auxiliary_loss_clip": 0.01574148, + "auxiliary_loss_mlp": 0.01499113, + "balance_loss_clip": 1.2064749, + "balance_loss_mlp": 1.12565422, + "epoch": 0.13611495220345096, + "flos": 26682802734240.0, + "grad_norm": 1.8554820945268813, + "language_loss": 0.85714555, + "learning_rate": 3.8831455295783994e-06, + "loss": 0.88787818, + "num_input_tokens_seen": 24118925, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 3.73046875, + "step": 1132, + "time_per_iteration": 3.052978754043579 + }, + { + "auxiliary_loss_clip": 0.0157057, + "auxiliary_loss_mlp": 0.01474092, + "balance_loss_clip": 1.20239234, + "balance_loss_mlp": 1.1067369, + "epoch": 0.13623519509409007, + "flos": 21688221000960.0, + "grad_norm": 2.0441444530552824, + "language_loss": 0.74150205, + "learning_rate": 3.882883021594864e-06, + "loss": 0.7719487, + "num_input_tokens_seen": 24137065, + "router_z_loss_clip": 3.68359375, + "router_z_loss_mlp": 3.66992188, + "step": 1133, + "time_per_iteration": 3.072843074798584 + }, + { + "auxiliary_loss_clip": 0.01578823, + "auxiliary_loss_mlp": 0.01495121, + "balance_loss_clip": 1.21058846, + "balance_loss_mlp": 1.12757492, + "epoch": 0.13635543798472916, + "flos": 14832182450880.0, + "grad_norm": 2.2741346408471994, + "language_loss": 0.86915541, + "learning_rate": 3.8826202279794705e-06, + "loss": 0.89989483, + "num_input_tokens_seen": 24154125, + "router_z_loss_clip": 3.68164062, + "router_z_loss_mlp": 3.67382812, + "step": 1134, + "time_per_iteration": 3.0025882720947266 + }, + { + "auxiliary_loss_clip": 0.01577913, + "auxiliary_loss_mlp": 0.01474384, + "balance_loss_clip": 1.20992517, + "balance_loss_mlp": 1.09997177, + "epoch": 0.13647568087536824, + "flos": 22892376247200.0, + "grad_norm": 2.8641052926978197, + "language_loss": 0.70314205, + "learning_rate": 3.882357148772085e-06, + "loss": 0.73366505, + "num_input_tokens_seen": 24171550, + "router_z_loss_clip": 3.68164062, + "router_z_loss_mlp": 3.7421875, + "step": 1135, + "time_per_iteration": 2.9358465671539307 + }, + { + "auxiliary_loss_clip": 0.01572331, + "auxiliary_loss_mlp": 0.01474831, + "balance_loss_clip": 1.20283127, + "balance_loss_mlp": 1.09431505, + "epoch": 0.13659592376600732, + "flos": 19939980834720.0, + "grad_norm": 2.6866078505346795, + "language_loss": 0.84473288, + "learning_rate": 3.882093784012617e-06, + "loss": 0.8752045, + "num_input_tokens_seen": 24190190, + "router_z_loss_clip": 3.6953125, + "router_z_loss_mlp": 3.79882812, + "step": 1136, + "time_per_iteration": 2.96309494972229 + }, + { + "auxiliary_loss_clip": 0.01576072, + "auxiliary_loss_mlp": 0.01476734, + "balance_loss_clip": 1.20786572, + "balance_loss_mlp": 1.10232139, + "epoch": 0.13671616665664643, + "flos": 21430420747200.0, + "grad_norm": 2.1562109337738784, + "language_loss": 0.8438803, + "learning_rate": 3.881830133741019e-06, + "loss": 0.87440836, + "num_input_tokens_seen": 24209055, + "router_z_loss_clip": 3.68164062, + "router_z_loss_mlp": 3.74023438, + "step": 1137, + "time_per_iteration": 2.969571590423584 + }, + { + "auxiliary_loss_clip": 0.01570726, + "auxiliary_loss_mlp": 0.01477695, + "balance_loss_clip": 1.20189035, + "balance_loss_mlp": 1.1023283, + "epoch": 0.13683640954728551, + "flos": 22780031944320.0, + "grad_norm": 2.3647044820408887, + "language_loss": 0.76621825, + "learning_rate": 3.881566197997285e-06, + "loss": 0.79670244, + "num_input_tokens_seen": 24225490, + "router_z_loss_clip": 3.69140625, + "router_z_loss_mlp": 3.75195312, + "step": 1138, + "time_per_iteration": 2.9171299934387207 + }, + { + "auxiliary_loss_clip": 0.01566252, + "auxiliary_loss_mlp": 0.01479045, + "balance_loss_clip": 1.19854021, + "balance_loss_mlp": 1.09814799, + "epoch": 0.1369566524379246, + "flos": 21728045933280.0, + "grad_norm": 1.689811352186292, + "language_loss": 0.74850368, + "learning_rate": 3.881301976821456e-06, + "loss": 0.77895665, + "num_input_tokens_seen": 24245520, + "router_z_loss_clip": 3.67773438, + "router_z_loss_mlp": 3.8046875, + "step": 1139, + "time_per_iteration": 3.065749406814575 + }, + { + "auxiliary_loss_clip": 0.01568765, + "auxiliary_loss_mlp": 0.01469344, + "balance_loss_clip": 1.20003939, + "balance_loss_mlp": 1.08901906, + "epoch": 0.1370768953285637, + "flos": 18626439682080.0, + "grad_norm": 2.0801337985009907, + "language_loss": 0.90686232, + "learning_rate": 3.881037470253612e-06, + "loss": 0.93724346, + "num_input_tokens_seen": 24265035, + "router_z_loss_clip": 3.69140625, + "router_z_loss_mlp": 3.796875, + "step": 1140, + "time_per_iteration": 2.9678750038146973 + }, + { + "auxiliary_loss_clip": 0.01572392, + "auxiliary_loss_mlp": 0.01482608, + "balance_loss_clip": 1.20397687, + "balance_loss_mlp": 1.10647917, + "epoch": 0.1371971382192028, + "flos": 14941302860160.0, + "grad_norm": 2.7963641696949413, + "language_loss": 0.79694772, + "learning_rate": 3.88077267833388e-06, + "loss": 0.82749772, + "num_input_tokens_seen": 24281550, + "router_z_loss_clip": 3.68554688, + "router_z_loss_mlp": 3.7578125, + "step": 1141, + "time_per_iteration": 3.037586212158203 + }, + { + "auxiliary_loss_clip": 0.01567563, + "auxiliary_loss_mlp": 0.01472432, + "balance_loss_clip": 1.19749177, + "balance_loss_mlp": 1.09096229, + "epoch": 0.13731738110984187, + "flos": 19025675501760.0, + "grad_norm": 2.493190659266812, + "language_loss": 0.84207678, + "learning_rate": 3.880507601102427e-06, + "loss": 0.8724767, + "num_input_tokens_seen": 24299485, + "router_z_loss_clip": 3.70117188, + "router_z_loss_mlp": 3.81054688, + "step": 1142, + "time_per_iteration": 3.1460843086242676 + }, + { + "auxiliary_loss_clip": 0.01568058, + "auxiliary_loss_mlp": 0.01471223, + "balance_loss_clip": 1.19926894, + "balance_loss_mlp": 1.081743, + "epoch": 0.13743762400048098, + "flos": 18189464978880.0, + "grad_norm": 1.8685887271436652, + "language_loss": 0.82574761, + "learning_rate": 3.880242238599467e-06, + "loss": 0.85614038, + "num_input_tokens_seen": 24316010, + "router_z_loss_clip": 3.6875, + "router_z_loss_mlp": 3.890625, + "step": 1143, + "time_per_iteration": 2.94050931930542 + }, + { + "auxiliary_loss_clip": 0.01568322, + "auxiliary_loss_mlp": 0.01492337, + "balance_loss_clip": 1.19774508, + "balance_loss_mlp": 1.11067629, + "epoch": 0.13755786689112007, + "flos": 21034067467680.0, + "grad_norm": 1.9676919944840727, + "language_loss": 0.83452749, + "learning_rate": 3.879976590865254e-06, + "loss": 0.865134, + "num_input_tokens_seen": 24335465, + "router_z_loss_clip": 3.703125, + "router_z_loss_mlp": 3.81445312, + "step": 1144, + "time_per_iteration": 3.9340929985046387 + }, + { + "auxiliary_loss_clip": 0.01563145, + "auxiliary_loss_mlp": 0.01478789, + "balance_loss_clip": 1.19228625, + "balance_loss_mlp": 1.09865427, + "epoch": 0.13767810978175915, + "flos": 21362756181120.0, + "grad_norm": 3.0468028576121022, + "language_loss": 0.87550563, + "learning_rate": 3.879710657940087e-06, + "loss": 0.90592498, + "num_input_tokens_seen": 24354415, + "router_z_loss_clip": 3.71679688, + "router_z_loss_mlp": 3.79882812, + "step": 1145, + "time_per_iteration": 3.1159491539001465 + }, + { + "auxiliary_loss_clip": 0.01564135, + "auxiliary_loss_mlp": 0.0148519, + "balance_loss_clip": 1.19237161, + "balance_loss_mlp": 1.10276651, + "epoch": 0.13779835267239823, + "flos": 30594752138880.0, + "grad_norm": 5.052793498541945, + "language_loss": 0.70433152, + "learning_rate": 3.879444439864308e-06, + "loss": 0.73482478, + "num_input_tokens_seen": 24373990, + "router_z_loss_clip": 3.72460938, + "router_z_loss_mlp": 3.82226562, + "step": 1146, + "time_per_iteration": 3.0241634845733643 + }, + { + "auxiliary_loss_clip": 0.0155786, + "auxiliary_loss_mlp": 0.01479104, + "balance_loss_clip": 1.18753397, + "balance_loss_mlp": 1.09935069, + "epoch": 0.13791859556303734, + "flos": 22671821810880.0, + "grad_norm": 2.113699826694774, + "language_loss": 0.86066413, + "learning_rate": 3.879177936678301e-06, + "loss": 0.89103371, + "num_input_tokens_seen": 24392995, + "router_z_loss_clip": 3.703125, + "router_z_loss_mlp": 3.79296875, + "step": 1147, + "time_per_iteration": 3.8612220287323 + }, + { + "auxiliary_loss_clip": 0.01557482, + "auxiliary_loss_mlp": 0.01486951, + "balance_loss_clip": 1.18615341, + "balance_loss_mlp": 1.10662508, + "epoch": 0.13803883845367643, + "flos": 35227002516480.0, + "grad_norm": 1.934031124062276, + "language_loss": 0.77585649, + "learning_rate": 3.878911148422496e-06, + "loss": 0.80630082, + "num_input_tokens_seen": 24414470, + "router_z_loss_clip": 3.71484375, + "router_z_loss_mlp": 3.79882812, + "step": 1148, + "time_per_iteration": 3.130476713180542 + }, + { + "auxiliary_loss_clip": 0.01562553, + "auxiliary_loss_mlp": 0.01476755, + "balance_loss_clip": 1.19126987, + "balance_loss_mlp": 1.09852791, + "epoch": 0.1381590813443155, + "flos": 32017261988160.0, + "grad_norm": 3.343512656369828, + "language_loss": 0.70594335, + "learning_rate": 3.878644075137364e-06, + "loss": 0.73633641, + "num_input_tokens_seen": 24435120, + "router_z_loss_clip": 3.71679688, + "router_z_loss_mlp": 3.77929688, + "step": 1149, + "time_per_iteration": 3.0757501125335693 + }, + { + "auxiliary_loss_clip": 0.01557485, + "auxiliary_loss_mlp": 0.01484486, + "balance_loss_clip": 1.18463075, + "balance_loss_mlp": 1.11045456, + "epoch": 0.13827932423495462, + "flos": 17823795945120.0, + "grad_norm": 8.176819294674347, + "language_loss": 0.7947346, + "learning_rate": 3.878376716863418e-06, + "loss": 0.8251543, + "num_input_tokens_seen": 24451420, + "router_z_loss_clip": 3.73242188, + "router_z_loss_mlp": 3.73632812, + "step": 1150, + "time_per_iteration": 2.971921920776367 + }, + { + "auxiliary_loss_clip": 0.01556876, + "auxiliary_loss_mlp": 0.01477653, + "balance_loss_clip": 1.18430686, + "balance_loss_mlp": 1.10285878, + "epoch": 0.1383995671255937, + "flos": 19429121347200.0, + "grad_norm": 2.762711910828453, + "language_loss": 0.71770543, + "learning_rate": 3.878109073641219e-06, + "loss": 0.74805069, + "num_input_tokens_seen": 24470450, + "router_z_loss_clip": 3.72265625, + "router_z_loss_mlp": 3.74609375, + "step": 1151, + "time_per_iteration": 3.8646414279937744 + }, + { + "auxiliary_loss_clip": 0.01554134, + "auxiliary_loss_mlp": 0.01474426, + "balance_loss_clip": 1.17981696, + "balance_loss_mlp": 1.09963179, + "epoch": 0.13851981001623279, + "flos": 28299316943520.0, + "grad_norm": 1.6318314039593478, + "language_loss": 0.81388342, + "learning_rate": 3.877841145511366e-06, + "loss": 0.84416902, + "num_input_tokens_seen": 24493190, + "router_z_loss_clip": 3.7421875, + "router_z_loss_mlp": 3.74414062, + "step": 1152, + "time_per_iteration": 3.0144879817962646 + }, + { + "auxiliary_loss_clip": 0.01559789, + "auxiliary_loss_mlp": 0.01482066, + "balance_loss_clip": 1.18545747, + "balance_loss_mlp": 1.11242199, + "epoch": 0.13864005290687187, + "flos": 21215100396960.0, + "grad_norm": 2.025686612169658, + "language_loss": 0.82969034, + "learning_rate": 3.8775729325145035e-06, + "loss": 0.86010891, + "num_input_tokens_seen": 24512425, + "router_z_loss_clip": 3.74414062, + "router_z_loss_mlp": 3.69140625, + "step": 1153, + "time_per_iteration": 3.050262212753296 + }, + { + "auxiliary_loss_clip": 0.01687176, + "auxiliary_loss_mlp": 0.01576122, + "balance_loss_clip": 1.31224728, + "balance_loss_mlp": 1.28162766, + "epoch": 0.13876029579751098, + "flos": 71661232095840.0, + "grad_norm": 0.8454107046478635, + "language_loss": 0.64672238, + "learning_rate": 3.877304434691321e-06, + "loss": 0.67935532, + "num_input_tokens_seen": 24579275, + "router_z_loss_clip": 3.75, + "router_z_loss_mlp": 2.9375, + "step": 1154, + "time_per_iteration": 3.5632143020629883 + }, + { + "auxiliary_loss_clip": 0.01557681, + "auxiliary_loss_mlp": 0.01495087, + "balance_loss_clip": 1.18286943, + "balance_loss_mlp": 1.12887573, + "epoch": 0.13888053868815006, + "flos": 21943328355360.0, + "grad_norm": 2.030248358833148, + "language_loss": 0.79790777, + "learning_rate": 3.877035652082548e-06, + "loss": 0.82843542, + "num_input_tokens_seen": 24598720, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 3.66015625, + "step": 1155, + "time_per_iteration": 3.0159847736358643 + }, + { + "auxiliary_loss_clip": 0.01556715, + "auxiliary_loss_mlp": 0.0148221, + "balance_loss_clip": 1.18155682, + "balance_loss_mlp": 1.10951471, + "epoch": 0.13900078157878915, + "flos": 19610609414400.0, + "grad_norm": 1.6973567461468586, + "language_loss": 0.85384786, + "learning_rate": 3.87676658472896e-06, + "loss": 0.88423717, + "num_input_tokens_seen": 24617530, + "router_z_loss_clip": 3.74609375, + "router_z_loss_mlp": 3.72265625, + "step": 1156, + "time_per_iteration": 3.061939239501953 + }, + { + "auxiliary_loss_clip": 0.01550524, + "auxiliary_loss_mlp": 0.01486998, + "balance_loss_clip": 1.17578292, + "balance_loss_mlp": 1.11468387, + "epoch": 0.13912102446942826, + "flos": 22640265217440.0, + "grad_norm": 2.1937925094430586, + "language_loss": 0.85497034, + "learning_rate": 3.876497232671372e-06, + "loss": 0.88534558, + "num_input_tokens_seen": 24637485, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 3.72070312, + "step": 1157, + "time_per_iteration": 2.972797155380249 + }, + { + "auxiliary_loss_clip": 0.01558086, + "auxiliary_loss_mlp": 0.01481889, + "balance_loss_clip": 1.18332767, + "balance_loss_mlp": 1.11338878, + "epoch": 0.13924126736006734, + "flos": 29645931816000.0, + "grad_norm": 2.4454078682206686, + "language_loss": 0.83796751, + "learning_rate": 3.876227595950647e-06, + "loss": 0.86836731, + "num_input_tokens_seen": 24656915, + "router_z_loss_clip": 3.74609375, + "router_z_loss_mlp": 3.68359375, + "step": 1158, + "time_per_iteration": 3.0077919960021973 + }, + { + "auxiliary_loss_clip": 0.01554652, + "auxiliary_loss_mlp": 0.01484369, + "balance_loss_clip": 1.17897189, + "balance_loss_mlp": 1.11129117, + "epoch": 0.13936151025070642, + "flos": 27420133451040.0, + "grad_norm": 2.59671445885363, + "language_loss": 0.79206467, + "learning_rate": 3.875957674607686e-06, + "loss": 0.82245481, + "num_input_tokens_seen": 24679190, + "router_z_loss_clip": 3.75390625, + "router_z_loss_mlp": 3.72851562, + "step": 1159, + "time_per_iteration": 2.977548360824585 + }, + { + "auxiliary_loss_clip": 0.01549497, + "auxiliary_loss_mlp": 0.01491519, + "balance_loss_clip": 1.17237532, + "balance_loss_mlp": 1.1148181, + "epoch": 0.1394817531413455, + "flos": 16401210239520.0, + "grad_norm": 2.4646722220580255, + "language_loss": 0.88153785, + "learning_rate": 3.8756874686834386e-06, + "loss": 0.91194803, + "num_input_tokens_seen": 24697405, + "router_z_loss_clip": 3.77148438, + "router_z_loss_mlp": 3.765625, + "step": 1160, + "time_per_iteration": 2.9767916202545166 + }, + { + "auxiliary_loss_clip": 0.01547863, + "auxiliary_loss_mlp": 0.01480786, + "balance_loss_clip": 1.17033887, + "balance_loss_mlp": 1.10561061, + "epoch": 0.13960199603198462, + "flos": 30925109691360.0, + "grad_norm": 1.7498024679435005, + "language_loss": 0.80658424, + "learning_rate": 3.875416978218893e-06, + "loss": 0.83687079, + "num_input_tokens_seen": 24720600, + "router_z_loss_clip": 3.77734375, + "router_z_loss_mlp": 3.74804688, + "step": 1161, + "time_per_iteration": 3.0723788738250732 + }, + { + "auxiliary_loss_clip": 0.01553134, + "auxiliary_loss_mlp": 0.01487516, + "balance_loss_clip": 1.17645442, + "balance_loss_mlp": 1.10146868, + "epoch": 0.1397222389226237, + "flos": 18115731907200.0, + "grad_norm": 2.741120760350565, + "language_loss": 0.83163756, + "learning_rate": 3.8751462032550835e-06, + "loss": 0.8620441, + "num_input_tokens_seen": 24737605, + "router_z_loss_clip": 3.77148438, + "router_z_loss_mlp": 3.859375, + "step": 1162, + "time_per_iteration": 2.9675424098968506 + }, + { + "auxiliary_loss_clip": 0.01556104, + "auxiliary_loss_mlp": 0.01477218, + "balance_loss_clip": 1.17821908, + "balance_loss_mlp": 1.09708333, + "epoch": 0.13984248181326278, + "flos": 16874634268800.0, + "grad_norm": 3.126147489166357, + "language_loss": 0.83083332, + "learning_rate": 3.874875143833085e-06, + "loss": 0.86116648, + "num_input_tokens_seen": 24755845, + "router_z_loss_clip": 3.77539062, + "router_z_loss_mlp": 3.79882812, + "step": 1163, + "time_per_iteration": 3.037797212600708 + }, + { + "auxiliary_loss_clip": 0.01547361, + "auxiliary_loss_mlp": 0.01476804, + "balance_loss_clip": 1.16970968, + "balance_loss_mlp": 1.10105634, + "epoch": 0.1399627247039019, + "flos": 54125162451360.0, + "grad_norm": 2.5211353100289227, + "language_loss": 0.69055897, + "learning_rate": 3.874603799994019e-06, + "loss": 0.72080064, + "num_input_tokens_seen": 24779380, + "router_z_loss_clip": 3.7734375, + "router_z_loss_mlp": 3.75195312, + "step": 1164, + "time_per_iteration": 3.2626209259033203 + }, + { + "auxiliary_loss_clip": 0.01560397, + "auxiliary_loss_mlp": 0.01475135, + "balance_loss_clip": 1.18290448, + "balance_loss_mlp": 1.09347486, + "epoch": 0.14008296759454097, + "flos": 11767139310240.0, + "grad_norm": 2.6325780284762734, + "language_loss": 0.86949271, + "learning_rate": 3.874332171779046e-06, + "loss": 0.89984798, + "num_input_tokens_seen": 24794260, + "router_z_loss_clip": 3.77734375, + "router_z_loss_mlp": 3.8125, + "step": 1165, + "time_per_iteration": 2.9218811988830566 + }, + { + "auxiliary_loss_clip": 0.0155009, + "auxiliary_loss_mlp": 0.01486446, + "balance_loss_clip": 1.17382646, + "balance_loss_mlp": 1.10173368, + "epoch": 0.14020321048518006, + "flos": 22019678470080.0, + "grad_norm": 4.857554086606851, + "language_loss": 0.75939667, + "learning_rate": 3.874060259229373e-06, + "loss": 0.78976202, + "num_input_tokens_seen": 24815835, + "router_z_loss_clip": 3.76367188, + "router_z_loss_mlp": 3.84375, + "step": 1166, + "time_per_iteration": 3.081425905227661 + }, + { + "auxiliary_loss_clip": 0.01557883, + "auxiliary_loss_mlp": 0.01473991, + "balance_loss_clip": 1.17909431, + "balance_loss_mlp": 1.0930934, + "epoch": 0.14032345337581917, + "flos": 23406573412800.0, + "grad_norm": 3.707361951612107, + "language_loss": 0.94096613, + "learning_rate": 3.873788062386249e-06, + "loss": 0.97128487, + "num_input_tokens_seen": 24834095, + "router_z_loss_clip": 3.78710938, + "router_z_loss_mlp": 3.8046875, + "step": 1167, + "time_per_iteration": 2.9686851501464844 + }, + { + "auxiliary_loss_clip": 0.01562489, + "auxiliary_loss_mlp": 0.0148204, + "balance_loss_clip": 1.18481982, + "balance_loss_mlp": 1.1072464, + "epoch": 0.14044369626645825, + "flos": 29648510930880.0, + "grad_norm": 3.0108597983985175, + "language_loss": 0.82293969, + "learning_rate": 3.873515581290965e-06, + "loss": 0.85338497, + "num_input_tokens_seen": 24858900, + "router_z_loss_clip": 3.77539062, + "router_z_loss_mlp": 3.74609375, + "step": 1168, + "time_per_iteration": 3.1435976028442383 + }, + { + "auxiliary_loss_clip": 0.01566142, + "auxiliary_loss_mlp": 0.01471199, + "balance_loss_clip": 1.19001269, + "balance_loss_mlp": 1.09850347, + "epoch": 0.14056393915709733, + "flos": 18334958857920.0, + "grad_norm": 2.6978104533188767, + "language_loss": 0.75823104, + "learning_rate": 3.8732428159848575e-06, + "loss": 0.7886045, + "num_input_tokens_seen": 24877875, + "router_z_loss_clip": 3.76367188, + "router_z_loss_mlp": 3.72460938, + "step": 1169, + "time_per_iteration": 3.050631284713745 + }, + { + "auxiliary_loss_clip": 0.01562221, + "auxiliary_loss_mlp": 0.01489275, + "balance_loss_clip": 1.18444538, + "balance_loss_mlp": 1.10933113, + "epoch": 0.14068418204773642, + "flos": 26689933228320.0, + "grad_norm": 2.1435191054266736, + "language_loss": 0.78824657, + "learning_rate": 3.872969766509304e-06, + "loss": 0.81876147, + "num_input_tokens_seen": 24898430, + "router_z_loss_clip": 3.77539062, + "router_z_loss_mlp": 3.79882812, + "step": 1170, + "time_per_iteration": 3.086366891860962 + }, + { + "auxiliary_loss_clip": 0.01772118, + "auxiliary_loss_mlp": 0.01467926, + "balance_loss_clip": 1.39590383, + "balance_loss_mlp": 1.10247803, + "epoch": 0.14080442493837553, + "flos": 65266290923040.0, + "grad_norm": 0.7907362162581747, + "language_loss": 0.55607659, + "learning_rate": 3.872696432905726e-06, + "loss": 0.58847702, + "num_input_tokens_seen": 24959250, + "router_z_loss_clip": 3.765625, + "router_z_loss_mlp": 3.6484375, + "step": 1171, + "time_per_iteration": 4.306023597717285 + }, + { + "auxiliary_loss_clip": 0.01554561, + "auxiliary_loss_mlp": 0.01470864, + "balance_loss_clip": 1.17690945, + "balance_loss_mlp": 1.09378111, + "epoch": 0.1409246678290146, + "flos": 25779155214240.0, + "grad_norm": 2.8838835173545436, + "language_loss": 0.71904373, + "learning_rate": 3.872422815215589e-06, + "loss": 0.74929798, + "num_input_tokens_seen": 24978330, + "router_z_loss_clip": 3.77539062, + "router_z_loss_mlp": 3.765625, + "step": 1172, + "time_per_iteration": 3.129734754562378 + }, + { + "auxiliary_loss_clip": 0.01550858, + "auxiliary_loss_mlp": 0.0148278, + "balance_loss_clip": 1.17298698, + "balance_loss_mlp": 1.10646021, + "epoch": 0.1410449107196537, + "flos": 21870505559520.0, + "grad_norm": 2.2164451263384057, + "language_loss": 0.74772882, + "learning_rate": 3.8721489134803994e-06, + "loss": 0.7780652, + "num_input_tokens_seen": 24997120, + "router_z_loss_clip": 3.7734375, + "router_z_loss_mlp": 3.75976562, + "step": 1173, + "time_per_iteration": 3.1191370487213135 + }, + { + "auxiliary_loss_clip": 0.01566634, + "auxiliary_loss_mlp": 0.01474335, + "balance_loss_clip": 1.18754184, + "balance_loss_mlp": 1.10278332, + "epoch": 0.1411651536102928, + "flos": 16685636425920.0, + "grad_norm": 2.8314049231773346, + "language_loss": 0.72705942, + "learning_rate": 3.871874727741707e-06, + "loss": 0.75746912, + "num_input_tokens_seen": 25014350, + "router_z_loss_clip": 3.79101562, + "router_z_loss_mlp": 3.7109375, + "step": 1174, + "time_per_iteration": 3.955097198486328 + }, + { + "auxiliary_loss_clip": 0.0156508, + "auxiliary_loss_mlp": 0.01470058, + "balance_loss_clip": 1.18762958, + "balance_loss_mlp": 1.10308433, + "epoch": 0.1412853965009319, + "flos": 20994166679040.0, + "grad_norm": 1.7990080593185367, + "language_loss": 0.97084689, + "learning_rate": 3.871600258041108e-06, + "loss": 1.00119829, + "num_input_tokens_seen": 25033875, + "router_z_loss_clip": 3.77148438, + "router_z_loss_mlp": 3.66601562, + "step": 1175, + "time_per_iteration": 3.8352560997009277 + }, + { + "auxiliary_loss_clip": 0.01563267, + "auxiliary_loss_mlp": 0.01471829, + "balance_loss_clip": 1.18667865, + "balance_loss_mlp": 1.0966537, + "epoch": 0.14140563939157097, + "flos": 20337130605600.0, + "grad_norm": 2.6539690914750778, + "language_loss": 0.85629928, + "learning_rate": 3.871325504420238e-06, + "loss": 0.8866502, + "num_input_tokens_seen": 25052865, + "router_z_loss_clip": 3.765625, + "router_z_loss_mlp": 3.74804688, + "step": 1176, + "time_per_iteration": 2.975020170211792 + }, + { + "auxiliary_loss_clip": 0.01566218, + "auxiliary_loss_mlp": 0.0147891, + "balance_loss_clip": 1.18957996, + "balance_loss_mlp": 1.1161325, + "epoch": 0.14152588228221005, + "flos": 21070934003520.0, + "grad_norm": 2.143698863438832, + "language_loss": 0.81958801, + "learning_rate": 3.871050466920776e-06, + "loss": 0.85003924, + "num_input_tokens_seen": 25072770, + "router_z_loss_clip": 3.765625, + "router_z_loss_mlp": 3.625, + "step": 1177, + "time_per_iteration": 3.024606704711914 + }, + { + "auxiliary_loss_clip": 0.01569675, + "auxiliary_loss_mlp": 0.01479121, + "balance_loss_clip": 1.19390607, + "balance_loss_mlp": 1.11062169, + "epoch": 0.14164612517284916, + "flos": 18225117813600.0, + "grad_norm": 2.024549393069066, + "language_loss": 0.79989922, + "learning_rate": 3.870775145584447e-06, + "loss": 0.83038723, + "num_input_tokens_seen": 25090550, + "router_z_loss_clip": 3.75390625, + "router_z_loss_mlp": 3.68164062, + "step": 1178, + "time_per_iteration": 3.961181640625 + }, + { + "auxiliary_loss_clip": 0.01566976, + "auxiliary_loss_mlp": 0.01490279, + "balance_loss_clip": 1.19029891, + "balance_loss_mlp": 1.12216091, + "epoch": 0.14176636806348825, + "flos": 22746503086560.0, + "grad_norm": 3.882402792388874, + "language_loss": 0.64944959, + "learning_rate": 3.8704995404530145e-06, + "loss": 0.68002212, + "num_input_tokens_seen": 25106175, + "router_z_loss_clip": 3.76367188, + "router_z_loss_mlp": 3.6796875, + "step": 1179, + "time_per_iteration": 3.061739206314087 + }, + { + "auxiliary_loss_clip": 0.0157437, + "auxiliary_loss_mlp": 0.01489352, + "balance_loss_clip": 1.20006037, + "balance_loss_mlp": 1.12638378, + "epoch": 0.14188661095412733, + "flos": 22093297757280.0, + "grad_norm": 2.1619009594402194, + "language_loss": 0.85229182, + "learning_rate": 3.87022365156829e-06, + "loss": 0.88292903, + "num_input_tokens_seen": 25126890, + "router_z_loss_clip": 3.74609375, + "router_z_loss_mlp": 3.62695312, + "step": 1180, + "time_per_iteration": 3.0278096199035645 + }, + { + "auxiliary_loss_clip": 0.0157006, + "auxiliary_loss_mlp": 0.01481739, + "balance_loss_clip": 1.19455957, + "balance_loss_mlp": 1.11514688, + "epoch": 0.14200685384476644, + "flos": 24354673100640.0, + "grad_norm": 2.111949684105288, + "language_loss": 0.81190324, + "learning_rate": 3.869947478972123e-06, + "loss": 0.84242123, + "num_input_tokens_seen": 25147915, + "router_z_loss_clip": 3.75390625, + "router_z_loss_mlp": 3.6640625, + "step": 1181, + "time_per_iteration": 3.2467269897460938 + }, + { + "auxiliary_loss_clip": 0.01569998, + "auxiliary_loss_mlp": 0.01490472, + "balance_loss_clip": 1.19412279, + "balance_loss_mlp": 1.12006474, + "epoch": 0.14212709673540552, + "flos": 24024201763680.0, + "grad_norm": 2.433478416232141, + "language_loss": 0.82519734, + "learning_rate": 3.869671022706412e-06, + "loss": 0.85580206, + "num_input_tokens_seen": 25166645, + "router_z_loss_clip": 3.75976562, + "router_z_loss_mlp": 3.70117188, + "step": 1182, + "time_per_iteration": 3.117187023162842 + }, + { + "auxiliary_loss_clip": 0.0157617, + "auxiliary_loss_mlp": 0.01488222, + "balance_loss_clip": 1.20237172, + "balance_loss_mlp": 1.12410891, + "epoch": 0.1422473396260446, + "flos": 26434181095200.0, + "grad_norm": 3.1755762755424075, + "language_loss": 0.65293294, + "learning_rate": 3.869394282813092e-06, + "loss": 0.68357682, + "num_input_tokens_seen": 25185845, + "router_z_loss_clip": 3.734375, + "router_z_loss_mlp": 3.63867188, + "step": 1183, + "time_per_iteration": 3.185214042663574 + }, + { + "auxiliary_loss_clip": 0.01581523, + "auxiliary_loss_mlp": 0.01497689, + "balance_loss_clip": 1.20645666, + "balance_loss_mlp": 1.13224101, + "epoch": 0.1423675825166837, + "flos": 17057184324480.0, + "grad_norm": 3.814003096920533, + "language_loss": 0.89344954, + "learning_rate": 3.869117259334147e-06, + "loss": 0.92424166, + "num_input_tokens_seen": 25203770, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 3.65039062, + "step": 1184, + "time_per_iteration": 3.2020320892333984 + }, + { + "auxiliary_loss_clip": 0.01580537, + "auxiliary_loss_mlp": 0.01483463, + "balance_loss_clip": 1.20714498, + "balance_loss_mlp": 1.12087607, + "epoch": 0.1424878254073228, + "flos": 17931323371680.0, + "grad_norm": 2.0845430188363876, + "language_loss": 0.81848514, + "learning_rate": 3.868839952311599e-06, + "loss": 0.84912515, + "num_input_tokens_seen": 25221725, + "router_z_loss_clip": 3.74023438, + "router_z_loss_mlp": 3.62304688, + "step": 1185, + "time_per_iteration": 3.1036696434020996 + }, + { + "auxiliary_loss_clip": 0.01578666, + "auxiliary_loss_mlp": 0.01481543, + "balance_loss_clip": 1.20489597, + "balance_loss_mlp": 1.1149509, + "epoch": 0.14260806829796188, + "flos": 20305725724800.0, + "grad_norm": 2.7739562384160976, + "language_loss": 0.80622506, + "learning_rate": 3.868562361787516e-06, + "loss": 0.83682716, + "num_input_tokens_seen": 25240855, + "router_z_loss_clip": 3.734375, + "router_z_loss_mlp": 3.66601562, + "step": 1186, + "time_per_iteration": 3.117335081100464 + }, + { + "auxiliary_loss_clip": 0.0157531, + "auxiliary_loss_mlp": 0.01478004, + "balance_loss_clip": 1.20193946, + "balance_loss_mlp": 1.11160207, + "epoch": 0.14272831118860096, + "flos": 23187991240800.0, + "grad_norm": 3.41568296812642, + "language_loss": 0.69087487, + "learning_rate": 3.868284487804009e-06, + "loss": 0.72140801, + "num_input_tokens_seen": 25260085, + "router_z_loss_clip": 3.734375, + "router_z_loss_mlp": 3.66210938, + "step": 1187, + "time_per_iteration": 3.0781519412994385 + }, + { + "auxiliary_loss_clip": 0.01583321, + "auxiliary_loss_mlp": 0.01469592, + "balance_loss_clip": 1.20868611, + "balance_loss_mlp": 1.09517932, + "epoch": 0.14284855407924008, + "flos": 27234511214400.0, + "grad_norm": 2.3218400352169177, + "language_loss": 0.78250706, + "learning_rate": 3.86800633040323e-06, + "loss": 0.81303614, + "num_input_tokens_seen": 25280675, + "router_z_loss_clip": 3.74609375, + "router_z_loss_mlp": 3.7421875, + "step": 1188, + "time_per_iteration": 3.1570658683776855 + }, + { + "auxiliary_loss_clip": 0.01574594, + "auxiliary_loss_mlp": 0.01483242, + "balance_loss_clip": 1.20174646, + "balance_loss_mlp": 1.11207199, + "epoch": 0.14296879696987916, + "flos": 28186745071680.0, + "grad_norm": 2.4791426739526456, + "language_loss": 0.78695905, + "learning_rate": 3.867727889627376e-06, + "loss": 0.81753743, + "num_input_tokens_seen": 25300290, + "router_z_loss_clip": 3.72851562, + "router_z_loss_mlp": 3.70898438, + "step": 1189, + "time_per_iteration": 3.1610445976257324 + }, + { + "auxiliary_loss_clip": 0.01579942, + "auxiliary_loss_mlp": 0.01491422, + "balance_loss_clip": 1.206285, + "balance_loss_mlp": 1.12521148, + "epoch": 0.14308903986051824, + "flos": 19392292739520.0, + "grad_norm": 2.688581358178617, + "language_loss": 0.78276157, + "learning_rate": 3.867449165518687e-06, + "loss": 0.81347519, + "num_input_tokens_seen": 25316760, + "router_z_loss_clip": 3.73632812, + "router_z_loss_mlp": 3.66210938, + "step": 1190, + "time_per_iteration": 3.0138025283813477 + }, + { + "auxiliary_loss_clip": 0.01572501, + "auxiliary_loss_mlp": 0.01477204, + "balance_loss_clip": 1.19984698, + "balance_loss_mlp": 1.10603428, + "epoch": 0.14320928275115732, + "flos": 17459454396960.0, + "grad_norm": 3.986908843310345, + "language_loss": 0.71337783, + "learning_rate": 3.867170158119444e-06, + "loss": 0.74387491, + "num_input_tokens_seen": 25335760, + "router_z_loss_clip": 3.7265625, + "router_z_loss_mlp": 3.70898438, + "step": 1191, + "time_per_iteration": 3.0241737365722656 + }, + { + "auxiliary_loss_clip": 0.01582171, + "auxiliary_loss_mlp": 0.01503808, + "balance_loss_clip": 1.21040082, + "balance_loss_mlp": 1.14083922, + "epoch": 0.14332952564179643, + "flos": 21467969989920.0, + "grad_norm": 2.010011899834762, + "language_loss": 0.75391263, + "learning_rate": 3.866890867471972e-06, + "loss": 0.78477246, + "num_input_tokens_seen": 25354230, + "router_z_loss_clip": 3.71875, + "router_z_loss_mlp": 3.625, + "step": 1192, + "time_per_iteration": 3.0185654163360596 + }, + { + "auxiliary_loss_clip": 0.01578019, + "auxiliary_loss_mlp": 0.01483871, + "balance_loss_clip": 1.20521355, + "balance_loss_mlp": 1.11842322, + "epoch": 0.14344976853243552, + "flos": 16398744909120.0, + "grad_norm": 3.272956159979752, + "language_loss": 0.89638674, + "learning_rate": 3.86661129361864e-06, + "loss": 0.92700565, + "num_input_tokens_seen": 25368720, + "router_z_loss_clip": 3.72851562, + "router_z_loss_mlp": 3.65625, + "step": 1193, + "time_per_iteration": 2.952566146850586 + }, + { + "auxiliary_loss_clip": 0.01573408, + "auxiliary_loss_mlp": 0.01488421, + "balance_loss_clip": 1.20101547, + "balance_loss_mlp": 1.11667883, + "epoch": 0.1435700114230746, + "flos": 18918754925760.0, + "grad_norm": 2.328928361267658, + "language_loss": 0.86243379, + "learning_rate": 3.866331436601859e-06, + "loss": 0.8930521, + "num_input_tokens_seen": 25386715, + "router_z_loss_clip": 3.72460938, + "router_z_loss_mlp": 3.7109375, + "step": 1194, + "time_per_iteration": 3.027294874191284 + }, + { + "auxiliary_loss_clip": 0.01590669, + "auxiliary_loss_mlp": 0.01485847, + "balance_loss_clip": 1.21927333, + "balance_loss_mlp": 1.11582112, + "epoch": 0.1436902543137137, + "flos": 19757430779040.0, + "grad_norm": 2.4099727319260946, + "language_loss": 0.73804337, + "learning_rate": 3.866051296464083e-06, + "loss": 0.76880848, + "num_input_tokens_seen": 25405550, + "router_z_loss_clip": 3.71484375, + "router_z_loss_mlp": 3.69726562, + "step": 1195, + "time_per_iteration": 2.9819395542144775 + }, + { + "auxiliary_loss_clip": 0.01574977, + "auxiliary_loss_mlp": 0.01492979, + "balance_loss_clip": 1.20281422, + "balance_loss_mlp": 1.1240983, + "epoch": 0.1438104972043528, + "flos": 14686612715520.0, + "grad_norm": 2.4974258066211172, + "language_loss": 0.85262352, + "learning_rate": 3.86577087324781e-06, + "loss": 0.88330311, + "num_input_tokens_seen": 25422040, + "router_z_loss_clip": 3.72265625, + "router_z_loss_mlp": 3.6875, + "step": 1196, + "time_per_iteration": 2.943114757537842 + }, + { + "auxiliary_loss_clip": 0.01586491, + "auxiliary_loss_mlp": 0.0148277, + "balance_loss_clip": 1.21454704, + "balance_loss_mlp": 1.12113714, + "epoch": 0.14393074009499188, + "flos": 17094240501120.0, + "grad_norm": 2.9423423721500686, + "language_loss": 0.7779035, + "learning_rate": 3.865490166995578e-06, + "loss": 0.80859613, + "num_input_tokens_seen": 25440270, + "router_z_loss_clip": 3.72265625, + "router_z_loss_mlp": 3.6171875, + "step": 1197, + "time_per_iteration": 2.9613759517669678 + }, + { + "auxiliary_loss_clip": 0.01582492, + "auxiliary_loss_mlp": 0.01485837, + "balance_loss_clip": 1.20973253, + "balance_loss_mlp": 1.12305915, + "epoch": 0.144050982985631, + "flos": 30478501235520.0, + "grad_norm": 4.19880557343401, + "language_loss": 0.84269679, + "learning_rate": 3.86520917774997e-06, + "loss": 0.87338012, + "num_input_tokens_seen": 25459705, + "router_z_loss_clip": 3.72851562, + "router_z_loss_mlp": 3.625, + "step": 1198, + "time_per_iteration": 2.9923903942108154 + }, + { + "auxiliary_loss_clip": 0.01582333, + "auxiliary_loss_mlp": 0.01483664, + "balance_loss_clip": 1.21022725, + "balance_loss_mlp": 1.11440182, + "epoch": 0.14417122587627007, + "flos": 17860434912000.0, + "grad_norm": 2.22195347164805, + "language_loss": 0.75501704, + "learning_rate": 3.864927905553614e-06, + "loss": 0.78567708, + "num_input_tokens_seen": 25477615, + "router_z_loss_clip": 3.72265625, + "router_z_loss_mlp": 3.69140625, + "step": 1199, + "time_per_iteration": 3.8463997840881348 + }, + { + "auxiliary_loss_clip": 0.01579627, + "auxiliary_loss_mlp": 0.01470673, + "balance_loss_clip": 1.20574045, + "balance_loss_mlp": 1.09378088, + "epoch": 0.14429146876690915, + "flos": 21615929199360.0, + "grad_norm": 2.3342148851048536, + "language_loss": 0.89092296, + "learning_rate": 3.8646463504491765e-06, + "loss": 0.92142594, + "num_input_tokens_seen": 25497750, + "router_z_loss_clip": 3.73828125, + "router_z_loss_mlp": 3.765625, + "step": 1200, + "time_per_iteration": 3.180781602859497 + }, + { + "auxiliary_loss_clip": 0.01581652, + "auxiliary_loss_mlp": 0.0148305, + "balance_loss_clip": 1.21147454, + "balance_loss_mlp": 1.11283326, + "epoch": 0.14441171165754824, + "flos": 23260358898720.0, + "grad_norm": 2.0537857977410816, + "language_loss": 0.83245027, + "learning_rate": 3.8643645124793705e-06, + "loss": 0.86309725, + "num_input_tokens_seen": 25516650, + "router_z_loss_clip": 3.70507812, + "router_z_loss_mlp": 3.69921875, + "step": 1201, + "time_per_iteration": 3.0138394832611084 + }, + { + "auxiliary_loss_clip": 0.0157818, + "auxiliary_loss_mlp": 0.01478738, + "balance_loss_clip": 1.20678043, + "balance_loss_mlp": 1.10413456, + "epoch": 0.14453195454818735, + "flos": 42857731385280.0, + "grad_norm": 1.6905547647176877, + "language_loss": 0.74928772, + "learning_rate": 3.8640823916869515e-06, + "loss": 0.77985686, + "num_input_tokens_seen": 25540960, + "router_z_loss_clip": 3.71289062, + "router_z_loss_mlp": 3.7421875, + "step": 1202, + "time_per_iteration": 4.891374588012695 + }, + { + "auxiliary_loss_clip": 0.01578481, + "auxiliary_loss_mlp": 0.01458883, + "balance_loss_clip": 1.20779848, + "balance_loss_mlp": 1.08141863, + "epoch": 0.14465219743882643, + "flos": 27238379886720.0, + "grad_norm": 1.7978216904817348, + "language_loss": 0.78650451, + "learning_rate": 3.863799988114714e-06, + "loss": 0.8168782, + "num_input_tokens_seen": 25562990, + "router_z_loss_clip": 3.70898438, + "router_z_loss_mlp": 3.77148438, + "step": 1203, + "time_per_iteration": 3.0963125228881836 + }, + { + "auxiliary_loss_clip": 0.0158178, + "auxiliary_loss_mlp": 0.01477654, + "balance_loss_clip": 1.21128452, + "balance_loss_mlp": 1.09637499, + "epoch": 0.1447724403294655, + "flos": 16692577279200.0, + "grad_norm": 5.606806039857887, + "language_loss": 0.70502925, + "learning_rate": 3.863517301805502e-06, + "loss": 0.73562354, + "num_input_tokens_seen": 25581380, + "router_z_loss_clip": 3.70703125, + "router_z_loss_mlp": 3.81054688, + "step": 1204, + "time_per_iteration": 2.9873762130737305 + }, + { + "auxiliary_loss_clip": 0.01588649, + "auxiliary_loss_mlp": 0.01484118, + "balance_loss_clip": 1.21797752, + "balance_loss_mlp": 1.10760784, + "epoch": 0.14489268322010462, + "flos": 20075234110560.0, + "grad_norm": 2.488571389826508, + "language_loss": 0.96970683, + "learning_rate": 3.863234332802196e-06, + "loss": 1.00043452, + "num_input_tokens_seen": 25593585, + "router_z_loss_clip": 3.70507812, + "router_z_loss_mlp": 3.76171875, + "step": 1205, + "time_per_iteration": 3.7728354930877686 + }, + { + "auxiliary_loss_clip": 0.01594736, + "auxiliary_loss_mlp": 0.01466394, + "balance_loss_clip": 1.22374642, + "balance_loss_mlp": 1.08358955, + "epoch": 0.1450129261107437, + "flos": 27128083704480.0, + "grad_norm": 2.962914047153942, + "language_loss": 0.7458297, + "learning_rate": 3.862951081147723e-06, + "loss": 0.77644104, + "num_input_tokens_seen": 25613750, + "router_z_loss_clip": 3.71484375, + "router_z_loss_mlp": 3.82617188, + "step": 1206, + "time_per_iteration": 3.0149009227752686 + }, + { + "auxiliary_loss_clip": 0.01596893, + "auxiliary_loss_mlp": 0.01478206, + "balance_loss_clip": 1.22549295, + "balance_loss_mlp": 1.10112286, + "epoch": 0.1451331690013828, + "flos": 25704701507520.0, + "grad_norm": 3.453305170351007, + "language_loss": 0.78086615, + "learning_rate": 3.862667546885053e-06, + "loss": 0.81161714, + "num_input_tokens_seen": 25632300, + "router_z_loss_clip": 3.71875, + "router_z_loss_mlp": 3.76757812, + "step": 1207, + "time_per_iteration": 3.0228335857391357 + }, + { + "auxiliary_loss_clip": 0.01578381, + "auxiliary_loss_mlp": 0.01482874, + "balance_loss_clip": 1.20685768, + "balance_loss_mlp": 1.09396529, + "epoch": 0.14525341189202187, + "flos": 25739671635360.0, + "grad_norm": 4.230958611830903, + "language_loss": 0.73819876, + "learning_rate": 3.8623837300571965e-06, + "loss": 0.76881135, + "num_input_tokens_seen": 25651285, + "router_z_loss_clip": 3.71289062, + "router_z_loss_mlp": 3.88867188, + "step": 1208, + "time_per_iteration": 2.976360321044922 + }, + { + "auxiliary_loss_clip": 0.01587852, + "auxiliary_loss_mlp": 0.0148262, + "balance_loss_clip": 1.21688318, + "balance_loss_mlp": 1.10553741, + "epoch": 0.14537365478266098, + "flos": 23076026219520.0, + "grad_norm": 2.5891779012595633, + "language_loss": 0.83990383, + "learning_rate": 3.8620996307072085e-06, + "loss": 0.87060857, + "num_input_tokens_seen": 25671990, + "router_z_loss_clip": 3.70898438, + "router_z_loss_mlp": 3.76757812, + "step": 1209, + "time_per_iteration": 2.93645977973938 + }, + { + "auxiliary_loss_clip": 0.01582656, + "auxiliary_loss_mlp": 0.01474586, + "balance_loss_clip": 1.21273971, + "balance_loss_mlp": 1.09635878, + "epoch": 0.14549389767330007, + "flos": 20597130692640.0, + "grad_norm": 2.134163347153346, + "language_loss": 0.64972889, + "learning_rate": 3.861815248878188e-06, + "loss": 0.68030131, + "num_input_tokens_seen": 25689475, + "router_z_loss_clip": 3.69921875, + "router_z_loss_mlp": 3.77929688, + "step": 1210, + "time_per_iteration": 2.9156088829040527 + }, + { + "auxiliary_loss_clip": 0.01586697, + "auxiliary_loss_mlp": 0.01469563, + "balance_loss_clip": 1.21620476, + "balance_loss_mlp": 1.09438741, + "epoch": 0.14561414056393915, + "flos": 15123587418720.0, + "grad_norm": 2.732433203239631, + "language_loss": 0.80180502, + "learning_rate": 3.861530584613274e-06, + "loss": 0.83236766, + "num_input_tokens_seen": 25707475, + "router_z_loss_clip": 3.70507812, + "router_z_loss_mlp": 3.74804688, + "step": 1211, + "time_per_iteration": 3.095750093460083 + }, + { + "auxiliary_loss_clip": 0.01580913, + "auxiliary_loss_mlp": 0.01466742, + "balance_loss_clip": 1.21014166, + "balance_loss_mlp": 1.09480906, + "epoch": 0.14573438345457826, + "flos": 19429538556960.0, + "grad_norm": 2.377064352418935, + "language_loss": 0.8219763, + "learning_rate": 3.86124563795565e-06, + "loss": 0.85245287, + "num_input_tokens_seen": 25726290, + "router_z_loss_clip": 3.7109375, + "router_z_loss_mlp": 3.71679688, + "step": 1212, + "time_per_iteration": 2.958308219909668 + }, + { + "auxiliary_loss_clip": 0.01587206, + "auxiliary_loss_mlp": 0.01481247, + "balance_loss_clip": 1.21745908, + "balance_loss_mlp": 1.10588062, + "epoch": 0.14585462634521734, + "flos": 24830827957440.0, + "grad_norm": 1.9832565672283184, + "language_loss": 0.7030257, + "learning_rate": 3.860960408948543e-06, + "loss": 0.73371017, + "num_input_tokens_seen": 25748040, + "router_z_loss_clip": 3.69726562, + "router_z_loss_mlp": 3.74804688, + "step": 1213, + "time_per_iteration": 3.1070971488952637 + }, + { + "auxiliary_loss_clip": 0.01592119, + "auxiliary_loss_mlp": 0.01467552, + "balance_loss_clip": 1.22277474, + "balance_loss_mlp": 1.09848022, + "epoch": 0.14597486923585642, + "flos": 15450341796000.0, + "grad_norm": 3.263571680413074, + "language_loss": 0.89938056, + "learning_rate": 3.860674897635222e-06, + "loss": 0.92997724, + "num_input_tokens_seen": 25764525, + "router_z_loss_clip": 3.6953125, + "router_z_loss_mlp": 3.68554688, + "step": 1214, + "time_per_iteration": 2.9467153549194336 + }, + { + "auxiliary_loss_clip": 0.01589158, + "auxiliary_loss_mlp": 0.01473944, + "balance_loss_clip": 1.21990633, + "balance_loss_mlp": 1.09533572, + "epoch": 0.1460951121264955, + "flos": 16657076157120.0, + "grad_norm": 2.7376594662190823, + "language_loss": 0.83777422, + "learning_rate": 3.860389104058998e-06, + "loss": 0.86840522, + "num_input_tokens_seen": 25782755, + "router_z_loss_clip": 3.68945312, + "router_z_loss_mlp": 3.78320312, + "step": 1215, + "time_per_iteration": 2.9296507835388184 + }, + { + "auxiliary_loss_clip": 0.01581874, + "auxiliary_loss_mlp": 0.01482311, + "balance_loss_clip": 1.21279871, + "balance_loss_mlp": 1.11171281, + "epoch": 0.14621535501713462, + "flos": 24865608444480.0, + "grad_norm": 5.054014511243111, + "language_loss": 0.72677433, + "learning_rate": 3.860103028263227e-06, + "loss": 0.75741613, + "num_input_tokens_seen": 25805860, + "router_z_loss_clip": 3.68945312, + "router_z_loss_mlp": 3.703125, + "step": 1216, + "time_per_iteration": 3.0646755695343018 + }, + { + "auxiliary_loss_clip": 0.01576307, + "auxiliary_loss_mlp": 0.01473323, + "balance_loss_clip": 1.20685339, + "balance_loss_mlp": 1.10882902, + "epoch": 0.1463355979077737, + "flos": 25230329274240.0, + "grad_norm": 2.2467290442853045, + "language_loss": 0.70153201, + "learning_rate": 3.859816670291304e-06, + "loss": 0.73202837, + "num_input_tokens_seen": 25824955, + "router_z_loss_clip": 3.6953125, + "router_z_loss_mlp": 3.64257812, + "step": 1217, + "time_per_iteration": 3.0674338340759277 + }, + { + "auxiliary_loss_clip": 0.01580831, + "auxiliary_loss_mlp": 0.01473658, + "balance_loss_clip": 1.21165323, + "balance_loss_mlp": 1.10534918, + "epoch": 0.14645584079841278, + "flos": 22056317436960.0, + "grad_norm": 2.9909383655419766, + "language_loss": 0.90075707, + "learning_rate": 3.859530030186672e-06, + "loss": 0.93130195, + "num_input_tokens_seen": 25841965, + "router_z_loss_clip": 3.69140625, + "router_z_loss_mlp": 3.67773438, + "step": 1218, + "time_per_iteration": 2.9766950607299805 + }, + { + "auxiliary_loss_clip": 0.01587045, + "auxiliary_loss_mlp": 0.01481462, + "balance_loss_clip": 1.21702623, + "balance_loss_mlp": 1.11410642, + "epoch": 0.1465760836890519, + "flos": 23626179645120.0, + "grad_norm": 3.501556002463767, + "language_loss": 0.83013743, + "learning_rate": 3.859243107992813e-06, + "loss": 0.86082244, + "num_input_tokens_seen": 25860770, + "router_z_loss_clip": 3.703125, + "router_z_loss_mlp": 3.671875, + "step": 1219, + "time_per_iteration": 3.038645029067993 + }, + { + "auxiliary_loss_clip": 0.01583, + "auxiliary_loss_mlp": 0.01482533, + "balance_loss_clip": 1.21386743, + "balance_loss_mlp": 1.11250734, + "epoch": 0.14669632657969098, + "flos": 37410548546880.0, + "grad_norm": 8.178156500870063, + "language_loss": 0.78038794, + "learning_rate": 3.858955903753252e-06, + "loss": 0.81104326, + "num_input_tokens_seen": 25879410, + "router_z_loss_clip": 3.69140625, + "router_z_loss_mlp": 3.6953125, + "step": 1220, + "time_per_iteration": 3.065559148788452 + }, + { + "auxiliary_loss_clip": 0.01595458, + "auxiliary_loss_mlp": 0.01479866, + "balance_loss_clip": 1.22824383, + "balance_loss_mlp": 1.10430872, + "epoch": 0.14681656947033006, + "flos": 28368422779680.0, + "grad_norm": 3.7209355671502435, + "language_loss": 0.83588123, + "learning_rate": 3.858668417511559e-06, + "loss": 0.86663449, + "num_input_tokens_seen": 25902160, + "router_z_loss_clip": 3.67382812, + "router_z_loss_mlp": 3.75390625, + "step": 1221, + "time_per_iteration": 3.069370985031128 + }, + { + "auxiliary_loss_clip": 0.01585741, + "auxiliary_loss_mlp": 0.01473182, + "balance_loss_clip": 1.21753442, + "balance_loss_mlp": 1.10792506, + "epoch": 0.14693681236096917, + "flos": 18481514725440.0, + "grad_norm": 2.3238149072946648, + "language_loss": 0.75832403, + "learning_rate": 3.8583806493113445e-06, + "loss": 0.78891331, + "num_input_tokens_seen": 25920505, + "router_z_loss_clip": 3.68554688, + "router_z_loss_mlp": 3.65039062, + "step": 1222, + "time_per_iteration": 3.0539093017578125 + }, + { + "auxiliary_loss_clip": 0.01586197, + "auxiliary_loss_mlp": 0.01487235, + "balance_loss_clip": 1.21894062, + "balance_loss_mlp": 1.11358571, + "epoch": 0.14705705525160825, + "flos": 20779946245440.0, + "grad_norm": 2.1009746656646606, + "language_loss": 0.82498288, + "learning_rate": 3.858092599196263e-06, + "loss": 0.85571718, + "num_input_tokens_seen": 25938460, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 3.73242188, + "step": 1223, + "time_per_iteration": 2.9401111602783203 + }, + { + "auxiliary_loss_clip": 0.01585367, + "auxiliary_loss_mlp": 0.01475516, + "balance_loss_clip": 1.21603334, + "balance_loss_mlp": 1.10377359, + "epoch": 0.14717729814224734, + "flos": 29935440375840.0, + "grad_norm": 8.400711932761347, + "language_loss": 0.82391179, + "learning_rate": 3.857804267210012e-06, + "loss": 0.85452068, + "num_input_tokens_seen": 25957760, + "router_z_loss_clip": 3.69335938, + "router_z_loss_mlp": 3.71679688, + "step": 1224, + "time_per_iteration": 3.0731303691864014 + }, + { + "auxiliary_loss_clip": 0.01582915, + "auxiliary_loss_mlp": 0.01489985, + "balance_loss_clip": 1.21519232, + "balance_loss_mlp": 1.11805212, + "epoch": 0.14729754103288642, + "flos": 20049556381920.0, + "grad_norm": 1.9913492014930343, + "language_loss": 0.8807705, + "learning_rate": 3.857515653396331e-06, + "loss": 0.9114995, + "num_input_tokens_seen": 25974970, + "router_z_loss_clip": 3.67773438, + "router_z_loss_mlp": 3.71875, + "step": 1225, + "time_per_iteration": 3.0208396911621094 + }, + { + "auxiliary_loss_clip": 0.01588244, + "auxiliary_loss_mlp": 0.01486813, + "balance_loss_clip": 1.2194761, + "balance_loss_mlp": 1.11526132, + "epoch": 0.14741778392352553, + "flos": 19283551611840.0, + "grad_norm": 5.585418611287786, + "language_loss": 0.87346017, + "learning_rate": 3.857226757799002e-06, + "loss": 0.90421075, + "num_input_tokens_seen": 25992525, + "router_z_loss_clip": 3.68945312, + "router_z_loss_mlp": 3.71289062, + "step": 1226, + "time_per_iteration": 3.9471986293792725 + }, + { + "auxiliary_loss_clip": 0.01583653, + "auxiliary_loss_mlp": 0.01470217, + "balance_loss_clip": 1.21602201, + "balance_loss_mlp": 1.08779407, + "epoch": 0.1475380268141646, + "flos": 25413486180480.0, + "grad_norm": 2.766674171342832, + "language_loss": 0.7498976, + "learning_rate": 3.85693758046185e-06, + "loss": 0.78043628, + "num_input_tokens_seen": 26010815, + "router_z_loss_clip": 3.67382812, + "router_z_loss_mlp": 3.81835938, + "step": 1227, + "time_per_iteration": 3.080204725265503 + }, + { + "auxiliary_loss_clip": 0.01593913, + "auxiliary_loss_mlp": 0.01487177, + "balance_loss_clip": 1.22675514, + "balance_loss_mlp": 1.1223017, + "epoch": 0.1476582697048037, + "flos": 20849621004000.0, + "grad_norm": 2.7291603179665502, + "language_loss": 0.82875717, + "learning_rate": 3.8566481214287435e-06, + "loss": 0.85956806, + "num_input_tokens_seen": 26028935, + "router_z_loss_clip": 3.66992188, + "router_z_loss_mlp": 3.64648438, + "step": 1228, + "time_per_iteration": 2.9010539054870605 + }, + { + "auxiliary_loss_clip": 0.01596548, + "auxiliary_loss_mlp": 0.01490438, + "balance_loss_clip": 1.22923887, + "balance_loss_mlp": 1.12251055, + "epoch": 0.1477785125954428, + "flos": 14029880067360.0, + "grad_norm": 2.475140768742951, + "language_loss": 0.90742564, + "learning_rate": 3.8563583807435935e-06, + "loss": 0.93829548, + "num_input_tokens_seen": 26045080, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 3.67773438, + "step": 1229, + "time_per_iteration": 3.8966357707977295 + }, + { + "auxiliary_loss_clip": 0.01582192, + "auxiliary_loss_mlp": 0.01461773, + "balance_loss_clip": 1.21564162, + "balance_loss_mlp": 1.08297312, + "epoch": 0.1478987554860819, + "flos": 20518504888320.0, + "grad_norm": 1.916895011140953, + "language_loss": 0.77686632, + "learning_rate": 3.856068358450353e-06, + "loss": 0.80730599, + "num_input_tokens_seen": 26065030, + "router_z_loss_clip": 3.66796875, + "router_z_loss_mlp": 3.78515625, + "step": 1230, + "time_per_iteration": 3.813476800918579 + }, + { + "auxiliary_loss_clip": 0.01590652, + "auxiliary_loss_mlp": 0.01492557, + "balance_loss_clip": 1.2244662, + "balance_loss_mlp": 1.1282537, + "epoch": 0.14801899837672097, + "flos": 17858879857440.0, + "grad_norm": 1.8336274104030843, + "language_loss": 0.85766304, + "learning_rate": 3.8557780545930186e-06, + "loss": 0.88849509, + "num_input_tokens_seen": 26083445, + "router_z_loss_clip": 3.6640625, + "router_z_loss_mlp": 3.64257812, + "step": 1231, + "time_per_iteration": 3.033437967300415 + }, + { + "auxiliary_loss_clip": 0.01580805, + "auxiliary_loss_mlp": 0.01482067, + "balance_loss_clip": 1.21384144, + "balance_loss_mlp": 1.10841751, + "epoch": 0.14813924126736006, + "flos": 20883225718080.0, + "grad_norm": 2.000727771227196, + "language_loss": 0.79155934, + "learning_rate": 3.855487469215628e-06, + "loss": 0.82218808, + "num_input_tokens_seen": 26102375, + "router_z_loss_clip": 3.66992188, + "router_z_loss_mlp": 3.73242188, + "step": 1232, + "time_per_iteration": 3.7910618782043457 + }, + { + "auxiliary_loss_clip": 0.01594258, + "auxiliary_loss_mlp": 0.01468148, + "balance_loss_clip": 1.22653151, + "balance_loss_mlp": 1.09259105, + "epoch": 0.14825948415799917, + "flos": 37417072190400.0, + "grad_norm": 2.8387026701212092, + "language_loss": 0.72749043, + "learning_rate": 3.855196602362264e-06, + "loss": 0.75811446, + "num_input_tokens_seen": 26125295, + "router_z_loss_clip": 3.68164062, + "router_z_loss_mlp": 3.75195312, + "step": 1233, + "time_per_iteration": 3.095278739929199 + }, + { + "auxiliary_loss_clip": 0.01580681, + "auxiliary_loss_mlp": 0.01471964, + "balance_loss_clip": 1.21140754, + "balance_loss_mlp": 1.09926844, + "epoch": 0.14837972704863825, + "flos": 22016644217280.0, + "grad_norm": 2.6464460481320162, + "language_loss": 0.94320595, + "learning_rate": 3.854905454077051e-06, + "loss": 0.97373247, + "num_input_tokens_seen": 26142905, + "router_z_loss_clip": 3.69140625, + "router_z_loss_mlp": 3.72265625, + "step": 1234, + "time_per_iteration": 2.9028828144073486 + }, + { + "auxiliary_loss_clip": 0.01583025, + "auxiliary_loss_mlp": 0.01492837, + "balance_loss_clip": 1.21628118, + "balance_loss_mlp": 1.12452769, + "epoch": 0.14849996993927733, + "flos": 20998452561120.0, + "grad_norm": 1.8591537899657111, + "language_loss": 0.88319659, + "learning_rate": 3.854614024404155e-06, + "loss": 0.91395521, + "num_input_tokens_seen": 26161215, + "router_z_loss_clip": 3.66992188, + "router_z_loss_mlp": 3.67773438, + "step": 1235, + "time_per_iteration": 3.094329595565796 + }, + { + "auxiliary_loss_clip": 0.01585444, + "auxiliary_loss_mlp": 0.014789, + "balance_loss_clip": 1.21661258, + "balance_loss_mlp": 1.1052506, + "epoch": 0.14862021282991644, + "flos": 20050087376160.0, + "grad_norm": 2.069087267617714, + "language_loss": 0.89200282, + "learning_rate": 3.8543223133877865e-06, + "loss": 0.92264622, + "num_input_tokens_seen": 26179810, + "router_z_loss_clip": 3.68554688, + "router_z_loss_mlp": 3.734375, + "step": 1236, + "time_per_iteration": 3.086693525314331 + }, + { + "auxiliary_loss_clip": 0.01580127, + "auxiliary_loss_mlp": 0.01471939, + "balance_loss_clip": 1.21231627, + "balance_loss_mlp": 1.09600043, + "epoch": 0.14874045572055553, + "flos": 22714301714400.0, + "grad_norm": 1.9772834874775034, + "language_loss": 0.88281691, + "learning_rate": 3.854030321072198e-06, + "loss": 0.91333759, + "num_input_tokens_seen": 26199715, + "router_z_loss_clip": 3.68164062, + "router_z_loss_mlp": 3.7578125, + "step": 1237, + "time_per_iteration": 2.9974429607391357 + }, + { + "auxiliary_loss_clip": 0.01581958, + "auxiliary_loss_mlp": 0.01470328, + "balance_loss_clip": 1.2143451, + "balance_loss_mlp": 1.09152818, + "epoch": 0.1488606986111946, + "flos": 25413675821280.0, + "grad_norm": 2.406358692057694, + "language_loss": 0.73254579, + "learning_rate": 3.853738047501682e-06, + "loss": 0.76306868, + "num_input_tokens_seen": 26220275, + "router_z_loss_clip": 3.67382812, + "router_z_loss_mlp": 3.78515625, + "step": 1238, + "time_per_iteration": 3.14640212059021 + }, + { + "auxiliary_loss_clip": 0.01589205, + "auxiliary_loss_mlp": 0.01477797, + "balance_loss_clip": 1.21967387, + "balance_loss_mlp": 1.1054827, + "epoch": 0.1489809415018337, + "flos": 17020621213920.0, + "grad_norm": 1.8365260963205332, + "language_loss": 0.77344072, + "learning_rate": 3.85344549272058e-06, + "loss": 0.80411071, + "num_input_tokens_seen": 26238255, + "router_z_loss_clip": 3.6953125, + "router_z_loss_mlp": 3.71875, + "step": 1239, + "time_per_iteration": 3.1376142501831055 + }, + { + "auxiliary_loss_clip": 0.01589978, + "auxiliary_loss_mlp": 0.0147772, + "balance_loss_clip": 1.22108603, + "balance_loss_mlp": 1.1019721, + "epoch": 0.1491011843924728, + "flos": 33662374394400.0, + "grad_norm": 1.9052551455475086, + "language_loss": 0.82586682, + "learning_rate": 3.853152656773269e-06, + "loss": 0.85654378, + "num_input_tokens_seen": 26259690, + "router_z_loss_clip": 3.69140625, + "router_z_loss_mlp": 3.75195312, + "step": 1240, + "time_per_iteration": 3.1294400691986084 + }, + { + "auxiliary_loss_clip": 0.01583832, + "auxiliary_loss_mlp": 0.01476994, + "balance_loss_clip": 1.21422338, + "balance_loss_mlp": 1.10754013, + "epoch": 0.14922142728311188, + "flos": 21181154329440.0, + "grad_norm": 2.052800214949062, + "language_loss": 0.85096705, + "learning_rate": 3.852859539704174e-06, + "loss": 0.88157535, + "num_input_tokens_seen": 26278990, + "router_z_loss_clip": 3.69921875, + "router_z_loss_mlp": 3.69140625, + "step": 1241, + "time_per_iteration": 3.200814962387085 + }, + { + "auxiliary_loss_clip": 0.01585762, + "auxiliary_loss_mlp": 0.01470712, + "balance_loss_clip": 1.21674311, + "balance_loss_mlp": 1.10011399, + "epoch": 0.14934167017375097, + "flos": 29863072717920.0, + "grad_norm": 1.9486031909333643, + "language_loss": 0.76644599, + "learning_rate": 3.85256614155776e-06, + "loss": 0.79701078, + "num_input_tokens_seen": 26299120, + "router_z_loss_clip": 3.6953125, + "router_z_loss_mlp": 3.70117188, + "step": 1242, + "time_per_iteration": 3.0171844959259033 + }, + { + "auxiliary_loss_clip": 0.01579592, + "auxiliary_loss_mlp": 0.01487171, + "balance_loss_clip": 1.2114135, + "balance_loss_mlp": 1.11638236, + "epoch": 0.14946191306439008, + "flos": 17021114280000.0, + "grad_norm": 3.216683977277725, + "language_loss": 0.74468285, + "learning_rate": 3.852272462378535e-06, + "loss": 0.77535051, + "num_input_tokens_seen": 26316995, + "router_z_loss_clip": 3.67773438, + "router_z_loss_mlp": 3.70507812, + "step": 1243, + "time_per_iteration": 3.04618239402771 + }, + { + "auxiliary_loss_clip": 0.01582758, + "auxiliary_loss_mlp": 0.01474073, + "balance_loss_clip": 1.21282423, + "balance_loss_mlp": 1.10404706, + "epoch": 0.14958215595502916, + "flos": 15670858304160.0, + "grad_norm": 4.876908881266662, + "language_loss": 0.77839828, + "learning_rate": 3.85197850221105e-06, + "loss": 0.80896652, + "num_input_tokens_seen": 26333295, + "router_z_loss_clip": 3.69921875, + "router_z_loss_mlp": 3.69726562, + "step": 1244, + "time_per_iteration": 3.051366090774536 + }, + { + "auxiliary_loss_clip": 0.0158442, + "auxiliary_loss_mlp": 0.01477346, + "balance_loss_clip": 1.21572959, + "balance_loss_mlp": 1.11590338, + "epoch": 0.14970239884566824, + "flos": 33111310692960.0, + "grad_norm": 1.8923819372677857, + "language_loss": 0.76117432, + "learning_rate": 3.851684261099899e-06, + "loss": 0.79179198, + "num_input_tokens_seen": 26355035, + "router_z_loss_clip": 3.68554688, + "router_z_loss_mlp": 3.61328125, + "step": 1245, + "time_per_iteration": 3.084806203842163 + }, + { + "auxiliary_loss_clip": 0.0158457, + "auxiliary_loss_mlp": 0.01478947, + "balance_loss_clip": 1.21529627, + "balance_loss_mlp": 1.11121011, + "epoch": 0.14982264173630733, + "flos": 17823151166400.0, + "grad_norm": 2.5459364985857817, + "language_loss": 0.8682729, + "learning_rate": 3.851389739089718e-06, + "loss": 0.89890808, + "num_input_tokens_seen": 26371655, + "router_z_loss_clip": 3.69335938, + "router_z_loss_mlp": 3.67382812, + "step": 1246, + "time_per_iteration": 3.1130309104919434 + }, + { + "auxiliary_loss_clip": 0.01593501, + "auxiliary_loss_mlp": 0.01490069, + "balance_loss_clip": 1.22240484, + "balance_loss_mlp": 1.12042451, + "epoch": 0.14994288462694644, + "flos": 32411946428640.0, + "grad_norm": 2.2363024374540688, + "language_loss": 0.80569494, + "learning_rate": 3.851094936225186e-06, + "loss": 0.83653069, + "num_input_tokens_seen": 26392540, + "router_z_loss_clip": 3.71484375, + "router_z_loss_mlp": 3.6953125, + "step": 1247, + "time_per_iteration": 3.1440060138702393 + }, + { + "auxiliary_loss_clip": 0.01585773, + "auxiliary_loss_mlp": 0.01470419, + "balance_loss_clip": 1.2155602, + "balance_loss_mlp": 1.10420859, + "epoch": 0.15006312751758552, + "flos": 31797314402400.0, + "grad_norm": 1.7722355284678069, + "language_loss": 0.76791811, + "learning_rate": 3.850799852551024e-06, + "loss": 0.79848003, + "num_input_tokens_seen": 26414960, + "router_z_loss_clip": 3.70117188, + "router_z_loss_mlp": 3.66015625, + "step": 1248, + "time_per_iteration": 3.0293824672698975 + }, + { + "auxiliary_loss_clip": 0.01588899, + "auxiliary_loss_mlp": 0.01473404, + "balance_loss_clip": 1.2183615, + "balance_loss_mlp": 1.10757494, + "epoch": 0.1501833704082246, + "flos": 16619906196000.0, + "grad_norm": 4.967586180877344, + "language_loss": 0.86239862, + "learning_rate": 3.850504488111995e-06, + "loss": 0.8930217, + "num_input_tokens_seen": 26431635, + "router_z_loss_clip": 3.70507812, + "router_z_loss_mlp": 3.65625, + "step": 1249, + "time_per_iteration": 3.006610631942749 + }, + { + "auxiliary_loss_clip": 0.01592039, + "auxiliary_loss_mlp": 0.014932, + "balance_loss_clip": 1.22231328, + "balance_loss_mlp": 1.12927759, + "epoch": 0.15030361329886371, + "flos": 23473100134080.0, + "grad_norm": 1.8631863004592488, + "language_loss": 0.82918704, + "learning_rate": 3.850208842952907e-06, + "loss": 0.86003935, + "num_input_tokens_seen": 26450440, + "router_z_loss_clip": 3.69726562, + "router_z_loss_mlp": 3.63867188, + "step": 1250, + "time_per_iteration": 2.9762113094329834 + }, + { + "auxiliary_loss_clip": 0.01579951, + "auxiliary_loss_mlp": 0.0147061, + "balance_loss_clip": 1.20972753, + "balance_loss_mlp": 1.09886765, + "epoch": 0.1504238561895028, + "flos": 25631764927200.0, + "grad_norm": 1.8201676151265533, + "language_loss": 0.79266602, + "learning_rate": 3.849912917118608e-06, + "loss": 0.82317162, + "num_input_tokens_seen": 26471480, + "router_z_loss_clip": 3.703125, + "router_z_loss_mlp": 3.71484375, + "step": 1251, + "time_per_iteration": 3.0931618213653564 + }, + { + "auxiliary_loss_clip": 0.01821143, + "auxiliary_loss_mlp": 0.01471764, + "balance_loss_clip": 1.45909977, + "balance_loss_mlp": 1.16048431, + "epoch": 0.15054409908014188, + "flos": 52101788133600.0, + "grad_norm": 0.8907534702877529, + "language_loss": 0.5917114, + "learning_rate": 3.849616710653992e-06, + "loss": 0.62464046, + "num_input_tokens_seen": 26532950, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 3.109375, + "step": 1252, + "time_per_iteration": 4.309167146682739 + }, + { + "auxiliary_loss_clip": 0.01583663, + "auxiliary_loss_mlp": 0.01484205, + "balance_loss_clip": 1.21435761, + "balance_loss_mlp": 1.11589622, + "epoch": 0.150664341970781, + "flos": 18882267671520.0, + "grad_norm": 2.238761873751245, + "language_loss": 0.74643934, + "learning_rate": 3.84932022360399e-06, + "loss": 0.77711809, + "num_input_tokens_seen": 26551615, + "router_z_loss_clip": 3.69335938, + "router_z_loss_mlp": 3.68164062, + "step": 1253, + "time_per_iteration": 3.00779128074646 + }, + { + "auxiliary_loss_clip": 0.01597863, + "auxiliary_loss_mlp": 0.01488972, + "balance_loss_clip": 1.22868824, + "balance_loss_mlp": 1.12295198, + "epoch": 0.15078458486142007, + "flos": 22165399918080.0, + "grad_norm": 2.932868473923094, + "language_loss": 0.84152007, + "learning_rate": 3.849023456013581e-06, + "loss": 0.87238848, + "num_input_tokens_seen": 26569175, + "router_z_loss_clip": 3.6953125, + "router_z_loss_mlp": 3.65820312, + "step": 1254, + "time_per_iteration": 3.0687649250030518 + }, + { + "auxiliary_loss_clip": 0.01581527, + "auxiliary_loss_mlp": 0.01474119, + "balance_loss_clip": 1.21196032, + "balance_loss_mlp": 1.0972265, + "epoch": 0.15090482775205916, + "flos": 26654356249920.0, + "grad_norm": 2.999196413672473, + "language_loss": 0.62356555, + "learning_rate": 3.848726407927784e-06, + "loss": 0.65412199, + "num_input_tokens_seen": 26589560, + "router_z_loss_clip": 3.69335938, + "router_z_loss_mlp": 3.765625, + "step": 1255, + "time_per_iteration": 3.0364151000976562 + }, + { + "auxiliary_loss_clip": 0.01582722, + "auxiliary_loss_mlp": 0.01483, + "balance_loss_clip": 1.21313834, + "balance_loss_mlp": 1.11564469, + "epoch": 0.15102507064269824, + "flos": 21801172154400.0, + "grad_norm": 3.1254533461455303, + "language_loss": 0.86824667, + "learning_rate": 3.84842907939166e-06, + "loss": 0.89890385, + "num_input_tokens_seen": 26608785, + "router_z_loss_clip": 3.69140625, + "router_z_loss_mlp": 3.66992188, + "step": 1256, + "time_per_iteration": 3.8860480785369873 + }, + { + "auxiliary_loss_clip": 0.01590678, + "auxiliary_loss_mlp": 0.01485064, + "balance_loss_clip": 1.22099161, + "balance_loss_mlp": 1.11313057, + "epoch": 0.15114531353333735, + "flos": 22823156626560.0, + "grad_norm": 2.72562662269751, + "language_loss": 0.71083117, + "learning_rate": 3.8481314704503146e-06, + "loss": 0.74158859, + "num_input_tokens_seen": 26628615, + "router_z_loss_clip": 3.6953125, + "router_z_loss_mlp": 3.71679688, + "step": 1257, + "time_per_iteration": 3.1206142902374268 + }, + { + "auxiliary_loss_clip": 0.01611704, + "auxiliary_loss_mlp": 0.0148161, + "balance_loss_clip": 1.2409296, + "balance_loss_mlp": 1.10910511, + "epoch": 0.15126555642397643, + "flos": 19684721767680.0, + "grad_norm": 2.850330931646687, + "language_loss": 0.88587463, + "learning_rate": 3.847833581148895e-06, + "loss": 0.91680783, + "num_input_tokens_seen": 26647525, + "router_z_loss_clip": 3.703125, + "router_z_loss_mlp": 3.72070312, + "step": 1258, + "time_per_iteration": 3.938260555267334 + }, + { + "auxiliary_loss_clip": 0.01585375, + "auxiliary_loss_mlp": 0.01469374, + "balance_loss_clip": 1.21504164, + "balance_loss_mlp": 1.09362602, + "epoch": 0.15138579931461552, + "flos": 28728288804960.0, + "grad_norm": 3.6290375217573905, + "language_loss": 0.81213737, + "learning_rate": 3.84753541153259e-06, + "loss": 0.84268486, + "num_input_tokens_seen": 26667095, + "router_z_loss_clip": 3.70507812, + "router_z_loss_mlp": 3.75390625, + "step": 1259, + "time_per_iteration": 3.0127017498016357 + }, + { + "auxiliary_loss_clip": 0.01597144, + "auxiliary_loss_mlp": 0.01484972, + "balance_loss_clip": 1.22609067, + "balance_loss_mlp": 1.11246705, + "epoch": 0.15150604220525463, + "flos": 22129178160960.0, + "grad_norm": 1.8012333911588099, + "language_loss": 0.83332777, + "learning_rate": 3.847236961646633e-06, + "loss": 0.86414891, + "num_input_tokens_seen": 26686075, + "router_z_loss_clip": 3.70898438, + "router_z_loss_mlp": 3.72265625, + "step": 1260, + "time_per_iteration": 3.853720188140869 + }, + { + "auxiliary_loss_clip": 0.01595018, + "auxiliary_loss_mlp": 0.014652, + "balance_loss_clip": 1.2255218, + "balance_loss_mlp": 1.09002459, + "epoch": 0.1516262850958937, + "flos": 12970346352480.0, + "grad_norm": 2.2914175619261665, + "language_loss": 0.78289515, + "learning_rate": 3.846938231536296e-06, + "loss": 0.8134973, + "num_input_tokens_seen": 26701695, + "router_z_loss_clip": 3.6953125, + "router_z_loss_mlp": 3.75, + "step": 1261, + "time_per_iteration": 3.08719801902771 + }, + { + "auxiliary_loss_clip": 0.01605997, + "auxiliary_loss_mlp": 0.01481106, + "balance_loss_clip": 1.23637772, + "balance_loss_mlp": 1.09849143, + "epoch": 0.1517465279865328, + "flos": 21799199890080.0, + "grad_norm": 1.7370402324227643, + "language_loss": 0.8107543, + "learning_rate": 3.8466392212468995e-06, + "loss": 0.84162533, + "num_input_tokens_seen": 26721885, + "router_z_loss_clip": 3.69921875, + "router_z_loss_mlp": 3.82617188, + "step": 1262, + "time_per_iteration": 3.097360849380493 + }, + { + "auxiliary_loss_clip": 0.01804615, + "auxiliary_loss_mlp": 0.01442856, + "balance_loss_clip": 1.44115901, + "balance_loss_mlp": 1.10105896, + "epoch": 0.15186677087717187, + "flos": 58180784819040.0, + "grad_norm": 0.8357283567833377, + "language_loss": 0.61915177, + "learning_rate": 3.8463399308238e-06, + "loss": 0.65162641, + "num_input_tokens_seen": 26780990, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 3.421875, + "step": 1263, + "time_per_iteration": 3.4375579357147217 + }, + { + "auxiliary_loss_clip": 0.01599539, + "auxiliary_loss_mlp": 0.01479138, + "balance_loss_clip": 1.22811139, + "balance_loss_mlp": 1.10701466, + "epoch": 0.15198701376781099, + "flos": 32672363725440.0, + "grad_norm": 2.5001946696596953, + "language_loss": 0.63955331, + "learning_rate": 3.846040360312402e-06, + "loss": 0.67034006, + "num_input_tokens_seen": 26804250, + "router_z_loss_clip": 3.71484375, + "router_z_loss_mlp": 3.71875, + "step": 1264, + "time_per_iteration": 3.120738983154297 + }, + { + "auxiliary_loss_clip": 0.0159249, + "auxiliary_loss_mlp": 0.01485811, + "balance_loss_clip": 1.22110796, + "balance_loss_mlp": 1.11769319, + "epoch": 0.15210725665845007, + "flos": 28405099674720.0, + "grad_norm": 2.391993703630935, + "language_loss": 0.81796193, + "learning_rate": 3.8457405097581485e-06, + "loss": 0.84874493, + "num_input_tokens_seen": 26823240, + "router_z_loss_clip": 3.7109375, + "router_z_loss_mlp": 3.67773438, + "step": 1265, + "time_per_iteration": 3.042130470275879 + }, + { + "auxiliary_loss_clip": 0.01598656, + "auxiliary_loss_mlp": 0.01490599, + "balance_loss_clip": 1.2257303, + "balance_loss_mlp": 1.12305319, + "epoch": 0.15222749954908915, + "flos": 19940246331840.0, + "grad_norm": 2.05612799601332, + "language_loss": 0.78233957, + "learning_rate": 3.8454403792065275e-06, + "loss": 0.81323206, + "num_input_tokens_seen": 26842060, + "router_z_loss_clip": 3.73046875, + "router_z_loss_mlp": 3.67382812, + "step": 1266, + "time_per_iteration": 3.1555492877960205 + }, + { + "auxiliary_loss_clip": 0.01600479, + "auxiliary_loss_mlp": 0.01495587, + "balance_loss_clip": 1.23001504, + "balance_loss_mlp": 1.13509774, + "epoch": 0.15234774243972826, + "flos": 21326344783200.0, + "grad_norm": 2.734130716490991, + "language_loss": 0.85602868, + "learning_rate": 3.845139968703068e-06, + "loss": 0.8869893, + "num_input_tokens_seen": 26859580, + "router_z_loss_clip": 3.703125, + "router_z_loss_mlp": 3.60742188, + "step": 1267, + "time_per_iteration": 3.006334066390991 + }, + { + "auxiliary_loss_clip": 0.01601974, + "auxiliary_loss_mlp": 0.01496845, + "balance_loss_clip": 1.23065627, + "balance_loss_mlp": 1.13788188, + "epoch": 0.15246798533036734, + "flos": 25960188143520.0, + "grad_norm": 3.8280466144641925, + "language_loss": 0.82912946, + "learning_rate": 3.844839278293342e-06, + "loss": 0.86011767, + "num_input_tokens_seen": 26880430, + "router_z_loss_clip": 3.7109375, + "router_z_loss_mlp": 3.59375, + "step": 1268, + "time_per_iteration": 3.0886237621307373 + }, + { + "auxiliary_loss_clip": 0.01597233, + "auxiliary_loss_mlp": 0.01482318, + "balance_loss_clip": 1.2266171, + "balance_loss_mlp": 1.12125731, + "epoch": 0.15258822822100643, + "flos": 25814277054720.0, + "grad_norm": 2.4789595557783297, + "language_loss": 0.7750082, + "learning_rate": 3.8445383080229654e-06, + "loss": 0.80580378, + "num_input_tokens_seen": 26896445, + "router_z_loss_clip": 3.70703125, + "router_z_loss_mlp": 3.61132812, + "step": 1269, + "time_per_iteration": 3.117337226867676 + }, + { + "auxiliary_loss_clip": 0.01592331, + "auxiliary_loss_mlp": 0.01483477, + "balance_loss_clip": 1.22099447, + "balance_loss_mlp": 1.12298846, + "epoch": 0.1527084711116455, + "flos": 25267840588800.0, + "grad_norm": 2.7372248804721053, + "language_loss": 0.73826551, + "learning_rate": 3.844237057937593e-06, + "loss": 0.76902354, + "num_input_tokens_seen": 26915450, + "router_z_loss_clip": 3.7109375, + "router_z_loss_mlp": 3.6015625, + "step": 1270, + "time_per_iteration": 3.0166585445404053 + }, + { + "auxiliary_loss_clip": 0.01592438, + "auxiliary_loss_mlp": 0.01505449, + "balance_loss_clip": 1.21957636, + "balance_loss_mlp": 1.14610505, + "epoch": 0.15282871400228462, + "flos": 29242713539520.0, + "grad_norm": 2.7372667143221374, + "language_loss": 0.77668458, + "learning_rate": 3.843935528082926e-06, + "loss": 0.80766344, + "num_input_tokens_seen": 26936475, + "router_z_loss_clip": 3.72851562, + "router_z_loss_mlp": 3.59375, + "step": 1271, + "time_per_iteration": 3.0232338905334473 + }, + { + "auxiliary_loss_clip": 0.0158904, + "auxiliary_loss_mlp": 0.01493505, + "balance_loss_clip": 1.21795487, + "balance_loss_mlp": 1.13702178, + "epoch": 0.1529489568929237, + "flos": 20884856628960.0, + "grad_norm": 2.054816310172929, + "language_loss": 0.85016382, + "learning_rate": 3.843633718504704e-06, + "loss": 0.88098925, + "num_input_tokens_seen": 26954920, + "router_z_loss_clip": 3.71289062, + "router_z_loss_mlp": 3.56445312, + "step": 1272, + "time_per_iteration": 3.0112528800964355 + }, + { + "auxiliary_loss_clip": 0.0159369, + "auxiliary_loss_mlp": 0.01491043, + "balance_loss_clip": 1.22333598, + "balance_loss_mlp": 1.13455927, + "epoch": 0.1530691997835628, + "flos": 20085891923520.0, + "grad_norm": 2.5341575644647625, + "language_loss": 0.90333724, + "learning_rate": 3.843331629248715e-06, + "loss": 0.93418467, + "num_input_tokens_seen": 26972520, + "router_z_loss_clip": 3.69921875, + "router_z_loss_mlp": 3.56640625, + "step": 1273, + "time_per_iteration": 2.9784693717956543 + }, + { + "auxiliary_loss_clip": 0.01594598, + "auxiliary_loss_mlp": 0.01494177, + "balance_loss_clip": 1.22230518, + "balance_loss_mlp": 1.13693047, + "epoch": 0.1531894426742019, + "flos": 28761931447200.0, + "grad_norm": 2.7768972040078697, + "language_loss": 0.76946908, + "learning_rate": 3.843029260360782e-06, + "loss": 0.80035686, + "num_input_tokens_seen": 26990890, + "router_z_loss_clip": 3.72265625, + "router_z_loss_mlp": 3.57421875, + "step": 1274, + "time_per_iteration": 3.0879130363464355 + }, + { + "auxiliary_loss_clip": 0.01597483, + "auxiliary_loss_mlp": 0.01491343, + "balance_loss_clip": 1.22601986, + "balance_loss_mlp": 1.13409698, + "epoch": 0.15330968556484098, + "flos": 22238450282880.0, + "grad_norm": 1.8848137146171906, + "language_loss": 0.79190838, + "learning_rate": 3.8427266118867755e-06, + "loss": 0.8227967, + "num_input_tokens_seen": 27010640, + "router_z_loss_clip": 3.71289062, + "router_z_loss_mlp": 3.57226562, + "step": 1275, + "time_per_iteration": 3.0197153091430664 + }, + { + "auxiliary_loss_clip": 0.01590415, + "auxiliary_loss_mlp": 0.01481653, + "balance_loss_clip": 1.21812201, + "balance_loss_mlp": 1.12288094, + "epoch": 0.15342992845548006, + "flos": 27530012423520.0, + "grad_norm": 2.66941560577316, + "language_loss": 0.82700372, + "learning_rate": 3.842423683872608e-06, + "loss": 0.85772437, + "num_input_tokens_seen": 27031215, + "router_z_loss_clip": 3.72070312, + "router_z_loss_mlp": 3.59179688, + "step": 1276, + "time_per_iteration": 3.024496078491211 + }, + { + "auxiliary_loss_clip": 0.01594129, + "auxiliary_loss_mlp": 0.01503048, + "balance_loss_clip": 1.22047603, + "balance_loss_mlp": 1.14141488, + "epoch": 0.15355017134611917, + "flos": 19611633474720.0, + "grad_norm": 2.596196570561747, + "language_loss": 0.78118736, + "learning_rate": 3.842120476364232e-06, + "loss": 0.81215918, + "num_input_tokens_seen": 27049665, + "router_z_loss_clip": 3.734375, + "router_z_loss_mlp": 3.61523438, + "step": 1277, + "time_per_iteration": 2.9847850799560547 + }, + { + "auxiliary_loss_clip": 0.01584809, + "auxiliary_loss_mlp": 0.01471627, + "balance_loss_clip": 1.21190464, + "balance_loss_mlp": 1.10846794, + "epoch": 0.15367041423675826, + "flos": 18480566521440.0, + "grad_norm": 2.224018130789464, + "language_loss": 0.84089637, + "learning_rate": 3.841816989407644e-06, + "loss": 0.8714608, + "num_input_tokens_seen": 27065155, + "router_z_loss_clip": 3.72851562, + "router_z_loss_mlp": 3.62695312, + "step": 1278, + "time_per_iteration": 3.0249600410461426 + }, + { + "auxiliary_loss_clip": 0.01588257, + "auxiliary_loss_mlp": 0.01482167, + "balance_loss_clip": 1.21387243, + "balance_loss_mlp": 1.11500287, + "epoch": 0.15379065712739734, + "flos": 41430897725760.0, + "grad_norm": 3.2170299761804344, + "language_loss": 0.77485049, + "learning_rate": 3.841513223048884e-06, + "loss": 0.80555475, + "num_input_tokens_seen": 27085840, + "router_z_loss_clip": 3.74414062, + "router_z_loss_mlp": 3.66992188, + "step": 1279, + "time_per_iteration": 4.035502910614014 + }, + { + "auxiliary_loss_clip": 0.01592909, + "auxiliary_loss_mlp": 0.01471235, + "balance_loss_clip": 1.21894121, + "balance_loss_mlp": 1.10693169, + "epoch": 0.15391090001803642, + "flos": 22056393293280.0, + "grad_norm": 3.524440434522951, + "language_loss": 0.78861499, + "learning_rate": 3.841209177334031e-06, + "loss": 0.81925642, + "num_input_tokens_seen": 27104200, + "router_z_loss_clip": 3.74023438, + "router_z_loss_mlp": 3.63867188, + "step": 1280, + "time_per_iteration": 3.150789260864258 + }, + { + "auxiliary_loss_clip": 0.01580351, + "auxiliary_loss_mlp": 0.01482458, + "balance_loss_clip": 1.20610857, + "balance_loss_mlp": 1.1183449, + "epoch": 0.15403114290867553, + "flos": 15452162347680.0, + "grad_norm": 1.939086633608575, + "language_loss": 0.7503252, + "learning_rate": 3.84090485230921e-06, + "loss": 0.78095329, + "num_input_tokens_seen": 27122440, + "router_z_loss_clip": 3.74023438, + "router_z_loss_mlp": 3.640625, + "step": 1281, + "time_per_iteration": 3.066580057144165 + }, + { + "auxiliary_loss_clip": 0.01591888, + "auxiliary_loss_mlp": 0.01481054, + "balance_loss_clip": 1.21896553, + "balance_loss_mlp": 1.11427116, + "epoch": 0.15415138579931462, + "flos": 17930982018240.0, + "grad_norm": 3.3475965691618494, + "language_loss": 0.76531851, + "learning_rate": 3.840600248020588e-06, + "loss": 0.79604793, + "num_input_tokens_seen": 27139380, + "router_z_loss_clip": 3.73046875, + "router_z_loss_mlp": 3.66601562, + "step": 1282, + "time_per_iteration": 3.106626510620117 + }, + { + "auxiliary_loss_clip": 0.01581115, + "auxiliary_loss_mlp": 0.01468067, + "balance_loss_clip": 1.20652223, + "balance_loss_mlp": 1.09632516, + "epoch": 0.1542716286899537, + "flos": 11430485683200.0, + "grad_norm": 2.842071642861171, + "language_loss": 0.7990905, + "learning_rate": 3.840295364514371e-06, + "loss": 0.82958233, + "num_input_tokens_seen": 27156760, + "router_z_loss_clip": 3.75, + "router_z_loss_mlp": 3.71289062, + "step": 1283, + "time_per_iteration": 3.0431172847747803 + }, + { + "auxiliary_loss_clip": 0.0158801, + "auxiliary_loss_mlp": 0.01476791, + "balance_loss_clip": 1.21388686, + "balance_loss_mlp": 1.10752845, + "epoch": 0.1543918715805928, + "flos": 17422246507680.0, + "grad_norm": 3.267475634909157, + "language_loss": 0.79317313, + "learning_rate": 3.83999020183681e-06, + "loss": 0.82382119, + "num_input_tokens_seen": 27175455, + "router_z_loss_clip": 3.7421875, + "router_z_loss_mlp": 3.69140625, + "step": 1284, + "time_per_iteration": 3.9010119438171387 + }, + { + "auxiliary_loss_clip": 0.01583199, + "auxiliary_loss_mlp": 0.01490403, + "balance_loss_clip": 1.20883965, + "balance_loss_mlp": 1.1203773, + "epoch": 0.1545121144712319, + "flos": 17788370679360.0, + "grad_norm": 2.110109947665473, + "language_loss": 0.78628099, + "learning_rate": 3.839684760034199e-06, + "loss": 0.81701696, + "num_input_tokens_seen": 27193660, + "router_z_loss_clip": 3.7421875, + "router_z_loss_mlp": 3.69726562, + "step": 1285, + "time_per_iteration": 3.900618076324463 + }, + { + "auxiliary_loss_clip": 0.01587788, + "auxiliary_loss_mlp": 0.01480534, + "balance_loss_clip": 1.21127009, + "balance_loss_mlp": 1.10688472, + "epoch": 0.15463235736187098, + "flos": 28222701331680.0, + "grad_norm": 3.446791159364883, + "language_loss": 0.65420008, + "learning_rate": 3.8393790391528716e-06, + "loss": 0.68488324, + "num_input_tokens_seen": 27214355, + "router_z_loss_clip": 3.765625, + "router_z_loss_mlp": 3.73242188, + "step": 1286, + "time_per_iteration": 3.0152506828308105 + }, + { + "auxiliary_loss_clip": 0.01575794, + "auxiliary_loss_mlp": 0.01463493, + "balance_loss_clip": 1.20094752, + "balance_loss_mlp": 1.08736384, + "epoch": 0.15475260025251006, + "flos": 22859112886560.0, + "grad_norm": 4.363679937727886, + "language_loss": 0.89472187, + "learning_rate": 3.8390730392392075e-06, + "loss": 0.92511481, + "num_input_tokens_seen": 27234335, + "router_z_loss_clip": 3.75, + "router_z_loss_mlp": 3.7578125, + "step": 1287, + "time_per_iteration": 3.7246131896972656 + }, + { + "auxiliary_loss_clip": 0.01578888, + "auxiliary_loss_mlp": 0.0147343, + "balance_loss_clip": 1.20403337, + "balance_loss_mlp": 1.09863615, + "epoch": 0.15487284314314917, + "flos": 17604796563360.0, + "grad_norm": 2.6672056549029217, + "language_loss": 0.79585487, + "learning_rate": 3.838766760339626e-06, + "loss": 0.82637811, + "num_input_tokens_seen": 27252860, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 3.74414062, + "step": 1288, + "time_per_iteration": 3.0522093772888184 + }, + { + "auxiliary_loss_clip": 0.01582519, + "auxiliary_loss_mlp": 0.01465012, + "balance_loss_clip": 1.20646715, + "balance_loss_mlp": 1.09231567, + "epoch": 0.15499308603378825, + "flos": 20083995515520.0, + "grad_norm": 2.8424172580521123, + "language_loss": 0.80003434, + "learning_rate": 3.838460202500587e-06, + "loss": 0.83050966, + "num_input_tokens_seen": 27268650, + "router_z_loss_clip": 3.7578125, + "router_z_loss_mlp": 3.72265625, + "step": 1289, + "time_per_iteration": 2.9359214305877686 + }, + { + "auxiliary_loss_clip": 0.01589062, + "auxiliary_loss_mlp": 0.01467349, + "balance_loss_clip": 1.2122612, + "balance_loss_mlp": 1.09179187, + "epoch": 0.15511332892442733, + "flos": 15919517871360.0, + "grad_norm": 2.386989162291466, + "language_loss": 0.74600679, + "learning_rate": 3.838153365768599e-06, + "loss": 0.77657092, + "num_input_tokens_seen": 27285160, + "router_z_loss_clip": 3.76757812, + "router_z_loss_mlp": 3.75195312, + "step": 1290, + "time_per_iteration": 2.921861171722412 + }, + { + "auxiliary_loss_clip": 0.01592037, + "auxiliary_loss_mlp": 0.01469794, + "balance_loss_clip": 1.21499288, + "balance_loss_mlp": 1.0932833, + "epoch": 0.15523357181506645, + "flos": 41285745200160.0, + "grad_norm": 2.908160671698617, + "language_loss": 0.75785697, + "learning_rate": 3.837846250190206e-06, + "loss": 0.78847522, + "num_input_tokens_seen": 27308025, + "router_z_loss_clip": 3.76757812, + "router_z_loss_mlp": 3.76171875, + "step": 1291, + "time_per_iteration": 3.204268217086792 + }, + { + "auxiliary_loss_clip": 0.01587885, + "auxiliary_loss_mlp": 0.01464376, + "balance_loss_clip": 1.21109557, + "balance_loss_mlp": 1.09511304, + "epoch": 0.15535381470570553, + "flos": 18480756162240.0, + "grad_norm": 2.826753637962366, + "language_loss": 0.77011651, + "learning_rate": 3.837538855811998e-06, + "loss": 0.80063915, + "num_input_tokens_seen": 27326200, + "router_z_loss_clip": 3.765625, + "router_z_loss_mlp": 3.69140625, + "step": 1292, + "time_per_iteration": 3.0719308853149414 + }, + { + "auxiliary_loss_clip": 0.01582934, + "auxiliary_loss_mlp": 0.01467504, + "balance_loss_clip": 1.2064693, + "balance_loss_mlp": 1.09919477, + "epoch": 0.1554740575963446, + "flos": 13919887310400.0, + "grad_norm": 2.187494217779871, + "language_loss": 0.71525526, + "learning_rate": 3.837231182680606e-06, + "loss": 0.74575961, + "num_input_tokens_seen": 27344165, + "router_z_loss_clip": 3.76367188, + "router_z_loss_mlp": 3.68164062, + "step": 1293, + "time_per_iteration": 2.963303327560425 + }, + { + "auxiliary_loss_clip": 0.01575566, + "auxiliary_loss_mlp": 0.01470251, + "balance_loss_clip": 1.19808054, + "balance_loss_mlp": 1.09946215, + "epoch": 0.1555943004869837, + "flos": 20849014153440.0, + "grad_norm": 2.8499549097759953, + "language_loss": 0.76288533, + "learning_rate": 3.836923230842706e-06, + "loss": 0.79334354, + "num_input_tokens_seen": 27363280, + "router_z_loss_clip": 3.77734375, + "router_z_loss_mlp": 3.70507812, + "step": 1294, + "time_per_iteration": 3.0070531368255615 + }, + { + "auxiliary_loss_clip": 0.0157115, + "auxiliary_loss_mlp": 0.01471755, + "balance_loss_clip": 1.19328368, + "balance_loss_mlp": 1.10478103, + "epoch": 0.1557145433776228, + "flos": 22087798174080.0, + "grad_norm": 2.7694283918003175, + "language_loss": 0.80892372, + "learning_rate": 3.836615000345011e-06, + "loss": 0.83935273, + "num_input_tokens_seen": 27381460, + "router_z_loss_clip": 3.77929688, + "router_z_loss_mlp": 3.66601562, + "step": 1295, + "time_per_iteration": 2.984344959259033 + }, + { + "auxiliary_loss_clip": 0.01572787, + "auxiliary_loss_mlp": 0.01452076, + "balance_loss_clip": 1.19538307, + "balance_loss_mlp": 1.0793798, + "epoch": 0.1558347862682619, + "flos": 19794031817760.0, + "grad_norm": 2.9574265240513458, + "language_loss": 0.78070033, + "learning_rate": 3.836306491234282e-06, + "loss": 0.81094885, + "num_input_tokens_seen": 27399310, + "router_z_loss_clip": 3.7734375, + "router_z_loss_mlp": 3.72265625, + "step": 1296, + "time_per_iteration": 3.0517330169677734 + }, + { + "auxiliary_loss_clip": 0.01581744, + "auxiliary_loss_mlp": 0.01465069, + "balance_loss_clip": 1.2034781, + "balance_loss_mlp": 1.09714162, + "epoch": 0.15595502915890097, + "flos": 17238937888800.0, + "grad_norm": 2.3475277441334796, + "language_loss": 0.75829208, + "learning_rate": 3.835997703557317e-06, + "loss": 0.78876024, + "num_input_tokens_seen": 27416050, + "router_z_loss_clip": 3.78125, + "router_z_loss_mlp": 3.67578125, + "step": 1297, + "time_per_iteration": 3.027111053466797 + }, + { + "auxiliary_loss_clip": 0.01579045, + "auxiliary_loss_mlp": 0.01460968, + "balance_loss_clip": 1.20279598, + "balance_loss_mlp": 1.09361231, + "epoch": 0.15607527204954008, + "flos": 19721588303520.0, + "grad_norm": 1.8790261755206128, + "language_loss": 0.80580485, + "learning_rate": 3.83568863736096e-06, + "loss": 0.83620501, + "num_input_tokens_seen": 27434920, + "router_z_loss_clip": 3.76171875, + "router_z_loss_mlp": 3.66992188, + "step": 1298, + "time_per_iteration": 3.003077507019043 + }, + { + "auxiliary_loss_clip": 0.01576127, + "auxiliary_loss_mlp": 0.01470554, + "balance_loss_clip": 1.19610226, + "balance_loss_mlp": 1.10129166, + "epoch": 0.15619551494017916, + "flos": 18517660626240.0, + "grad_norm": 5.4693433881696665, + "language_loss": 0.89716923, + "learning_rate": 3.8353792926920975e-06, + "loss": 0.92763603, + "num_input_tokens_seen": 27453570, + "router_z_loss_clip": 3.80078125, + "router_z_loss_mlp": 3.68945312, + "step": 1299, + "time_per_iteration": 3.1044158935546875 + }, + { + "auxiliary_loss_clip": 0.01576594, + "auxiliary_loss_mlp": 0.01484762, + "balance_loss_clip": 1.19855809, + "balance_loss_mlp": 1.11988616, + "epoch": 0.15631575783081825, + "flos": 19904214215520.0, + "grad_norm": 3.1295074150005506, + "language_loss": 0.82034147, + "learning_rate": 3.835069669597655e-06, + "loss": 0.85095513, + "num_input_tokens_seen": 27471960, + "router_z_loss_clip": 3.78125, + "router_z_loss_mlp": 3.6484375, + "step": 1300, + "time_per_iteration": 2.9390509128570557 + }, + { + "auxiliary_loss_clip": 0.01575233, + "auxiliary_loss_mlp": 0.01485224, + "balance_loss_clip": 1.19757473, + "balance_loss_mlp": 1.12511635, + "epoch": 0.15643600072145733, + "flos": 20779794532800.0, + "grad_norm": 2.542765577265865, + "language_loss": 0.79636812, + "learning_rate": 3.834759768124603e-06, + "loss": 0.82697272, + "num_input_tokens_seen": 27490835, + "router_z_loss_clip": 3.77734375, + "router_z_loss_mlp": 3.6015625, + "step": 1301, + "time_per_iteration": 3.054126262664795 + }, + { + "auxiliary_loss_clip": 0.0157299, + "auxiliary_loss_mlp": 0.01479507, + "balance_loss_clip": 1.19628477, + "balance_loss_mlp": 1.11272359, + "epoch": 0.15655624361209644, + "flos": 18548079374880.0, + "grad_norm": 2.3251985084466607, + "language_loss": 0.76463699, + "learning_rate": 3.834449588319953e-06, + "loss": 0.79516196, + "num_input_tokens_seen": 27508870, + "router_z_loss_clip": 3.76757812, + "router_z_loss_mlp": 3.6640625, + "step": 1302, + "time_per_iteration": 3.0218896865844727 + }, + { + "auxiliary_loss_clip": 0.01578318, + "auxiliary_loss_mlp": 0.01482579, + "balance_loss_clip": 1.20141912, + "balance_loss_mlp": 1.11522353, + "epoch": 0.15667648650273552, + "flos": 25231960185120.0, + "grad_norm": 2.0489923137787067, + "language_loss": 0.85042769, + "learning_rate": 3.834139130230758e-06, + "loss": 0.88103664, + "num_input_tokens_seen": 27528175, + "router_z_loss_clip": 3.7734375, + "router_z_loss_mlp": 3.66992188, + "step": 1303, + "time_per_iteration": 3.0649611949920654 + }, + { + "auxiliary_loss_clip": 0.01562377, + "auxiliary_loss_mlp": 0.01460571, + "balance_loss_clip": 1.18476057, + "balance_loss_mlp": 1.08653951, + "epoch": 0.1567967293933746, + "flos": 24829424615520.0, + "grad_norm": 2.0357776118627573, + "language_loss": 0.81059039, + "learning_rate": 3.833828393904117e-06, + "loss": 0.84081984, + "num_input_tokens_seen": 27548455, + "router_z_loss_clip": 3.77929688, + "router_z_loss_mlp": 3.734375, + "step": 1304, + "time_per_iteration": 3.103414535522461 + }, + { + "auxiliary_loss_clip": 0.01568943, + "auxiliary_loss_mlp": 0.01478776, + "balance_loss_clip": 1.19231164, + "balance_loss_mlp": 1.1087501, + "epoch": 0.15691697228401372, + "flos": 19166162863680.0, + "grad_norm": 2.3273564206584227, + "language_loss": 0.77695549, + "learning_rate": 3.833517379387165e-06, + "loss": 0.80743265, + "num_input_tokens_seen": 27564910, + "router_z_loss_clip": 3.76953125, + "router_z_loss_mlp": 3.69726562, + "step": 1305, + "time_per_iteration": 3.005007028579712 + }, + { + "auxiliary_loss_clip": 0.0157963, + "auxiliary_loss_mlp": 0.01481727, + "balance_loss_clip": 1.20201802, + "balance_loss_mlp": 1.10960364, + "epoch": 0.1570372151746528, + "flos": 24793278714720.0, + "grad_norm": 38.84610804302011, + "language_loss": 0.8916325, + "learning_rate": 3.833206086727085e-06, + "loss": 0.9222461, + "num_input_tokens_seen": 27584260, + "router_z_loss_clip": 3.77734375, + "router_z_loss_mlp": 3.71875, + "step": 1306, + "time_per_iteration": 3.092651605606079 + }, + { + "auxiliary_loss_clip": 0.0156935, + "auxiliary_loss_mlp": 0.01470756, + "balance_loss_clip": 1.19152164, + "balance_loss_mlp": 1.09519923, + "epoch": 0.15715745806529188, + "flos": 24865987726080.0, + "grad_norm": 2.364486236817296, + "language_loss": 0.70443392, + "learning_rate": 3.8328945159710994e-06, + "loss": 0.73483497, + "num_input_tokens_seen": 27604440, + "router_z_loss_clip": 3.77929688, + "router_z_loss_mlp": 3.75, + "step": 1307, + "time_per_iteration": 3.936147928237915 + }, + { + "auxiliary_loss_clip": 0.01579367, + "auxiliary_loss_mlp": 0.01476284, + "balance_loss_clip": 1.20337903, + "balance_loss_mlp": 1.11045492, + "epoch": 0.157277700955931, + "flos": 21874336303680.0, + "grad_norm": 2.6397893410183793, + "language_loss": 0.88997257, + "learning_rate": 3.832582667166473e-06, + "loss": 0.92052913, + "num_input_tokens_seen": 27624250, + "router_z_loss_clip": 3.75976562, + "router_z_loss_mlp": 3.65429688, + "step": 1308, + "time_per_iteration": 3.09395170211792 + }, + { + "auxiliary_loss_clip": 0.01573461, + "auxiliary_loss_mlp": 0.01461559, + "balance_loss_clip": 1.19696462, + "balance_loss_mlp": 1.09077036, + "epoch": 0.15739794384657008, + "flos": 24535554317280.0, + "grad_norm": 2.584440279634308, + "language_loss": 0.81528378, + "learning_rate": 3.8322705403605125e-06, + "loss": 0.84563398, + "num_input_tokens_seen": 27644595, + "router_z_loss_clip": 3.76367188, + "router_z_loss_mlp": 3.703125, + "step": 1309, + "time_per_iteration": 3.215083122253418 + }, + { + "auxiliary_loss_clip": 0.01571194, + "auxiliary_loss_mlp": 0.01471386, + "balance_loss_clip": 1.19362414, + "balance_loss_mlp": 1.09716439, + "epoch": 0.15751818673720916, + "flos": 17747066548800.0, + "grad_norm": 2.0974303031133696, + "language_loss": 0.8113395, + "learning_rate": 3.831958135600568e-06, + "loss": 0.84176528, + "num_input_tokens_seen": 27662145, + "router_z_loss_clip": 3.77539062, + "router_z_loss_mlp": 3.73828125, + "step": 1310, + "time_per_iteration": 3.9647250175476074 + }, + { + "auxiliary_loss_clip": 0.01576238, + "auxiliary_loss_mlp": 0.01461057, + "balance_loss_clip": 1.19966054, + "balance_loss_mlp": 1.09122157, + "epoch": 0.15763842962784824, + "flos": 17860472840160.0, + "grad_norm": 2.5531815878229382, + "language_loss": 0.79504484, + "learning_rate": 3.831645452934032e-06, + "loss": 0.82541776, + "num_input_tokens_seen": 27680575, + "router_z_loss_clip": 3.76367188, + "router_z_loss_mlp": 3.69726562, + "step": 1311, + "time_per_iteration": 2.9268014430999756 + }, + { + "auxiliary_loss_clip": 0.01581981, + "auxiliary_loss_mlp": 0.01459567, + "balance_loss_clip": 1.20628643, + "balance_loss_mlp": 1.08305645, + "epoch": 0.15775867251848735, + "flos": 26983424244960.0, + "grad_norm": 2.7354572361442013, + "language_loss": 0.80077982, + "learning_rate": 3.831332492408336e-06, + "loss": 0.83119529, + "num_input_tokens_seen": 27701985, + "router_z_loss_clip": 3.75585938, + "router_z_loss_mlp": 3.75976562, + "step": 1312, + "time_per_iteration": 3.9509549140930176 + }, + { + "auxiliary_loss_clip": 0.01569276, + "auxiliary_loss_mlp": 0.01472795, + "balance_loss_clip": 1.192451, + "balance_loss_mlp": 1.1027689, + "epoch": 0.15787891540912644, + "flos": 19242171624960.0, + "grad_norm": 2.00243156838988, + "language_loss": 0.69181103, + "learning_rate": 3.831019254070957e-06, + "loss": 0.72223175, + "num_input_tokens_seen": 27719770, + "router_z_loss_clip": 3.76757812, + "router_z_loss_mlp": 3.69921875, + "step": 1313, + "time_per_iteration": 2.9551773071289062 + }, + { + "auxiliary_loss_clip": 0.01570803, + "auxiliary_loss_mlp": 0.01467961, + "balance_loss_clip": 1.19426036, + "balance_loss_mlp": 1.10041487, + "epoch": 0.15799915829976552, + "flos": 27273539655360.0, + "grad_norm": 6.425298903357245, + "language_loss": 0.95470905, + "learning_rate": 3.8307057379694135e-06, + "loss": 0.98509669, + "num_input_tokens_seen": 27739105, + "router_z_loss_clip": 3.76171875, + "router_z_loss_mlp": 3.671875, + "step": 1314, + "time_per_iteration": 2.9443914890289307 + }, + { + "auxiliary_loss_clip": 0.01562162, + "auxiliary_loss_mlp": 0.01469519, + "balance_loss_clip": 1.1860224, + "balance_loss_mlp": 1.10178256, + "epoch": 0.15811940119040463, + "flos": 20407222573920.0, + "grad_norm": 5.319539836465522, + "language_loss": 0.82354796, + "learning_rate": 3.830391944151264e-06, + "loss": 0.85386473, + "num_input_tokens_seen": 27754985, + "router_z_loss_clip": 3.765625, + "router_z_loss_mlp": 3.67382812, + "step": 1315, + "time_per_iteration": 3.8582592010498047 + }, + { + "auxiliary_loss_clip": 0.01568554, + "auxiliary_loss_mlp": 0.01472415, + "balance_loss_clip": 1.19194484, + "balance_loss_mlp": 1.10849309, + "epoch": 0.1582396440810437, + "flos": 32602157972640.0, + "grad_norm": 2.4377652994569052, + "language_loss": 0.67610478, + "learning_rate": 3.830077872664114e-06, + "loss": 0.70651448, + "num_input_tokens_seen": 27776110, + "router_z_loss_clip": 3.765625, + "router_z_loss_mlp": 3.63671875, + "step": 1316, + "time_per_iteration": 3.0867011547088623 + }, + { + "auxiliary_loss_clip": 0.01569533, + "auxiliary_loss_mlp": 0.01472278, + "balance_loss_clip": 1.19369733, + "balance_loss_mlp": 1.09939158, + "epoch": 0.1583598869716828, + "flos": 33803278966080.0, + "grad_norm": 2.021723246233073, + "language_loss": 0.73003, + "learning_rate": 3.829763523555604e-06, + "loss": 0.7604481, + "num_input_tokens_seen": 27796510, + "router_z_loss_clip": 3.75976562, + "router_z_loss_mlp": 3.72460938, + "step": 1317, + "time_per_iteration": 3.0227718353271484 + }, + { + "auxiliary_loss_clip": 0.01578944, + "auxiliary_loss_mlp": 0.0147584, + "balance_loss_clip": 1.20309365, + "balance_loss_mlp": 1.11344314, + "epoch": 0.15848012986232188, + "flos": 24683665239360.0, + "grad_norm": 5.007501415517678, + "language_loss": 0.77622843, + "learning_rate": 3.829448896873423e-06, + "loss": 0.80677623, + "num_input_tokens_seen": 27815610, + "router_z_loss_clip": 3.7578125, + "router_z_loss_mlp": 3.62304688, + "step": 1318, + "time_per_iteration": 3.0594942569732666 + }, + { + "auxiliary_loss_clip": 0.01581737, + "auxiliary_loss_mlp": 0.01467515, + "balance_loss_clip": 1.20549107, + "balance_loss_mlp": 1.10206723, + "epoch": 0.158600372752961, + "flos": 22604308957440.0, + "grad_norm": 1.910391714222106, + "language_loss": 0.7918663, + "learning_rate": 3.829133992665299e-06, + "loss": 0.82235885, + "num_input_tokens_seen": 27834735, + "router_z_loss_clip": 3.76367188, + "router_z_loss_mlp": 3.65234375, + "step": 1319, + "time_per_iteration": 3.1422770023345947 + }, + { + "auxiliary_loss_clip": 0.01580326, + "auxiliary_loss_mlp": 0.01465513, + "balance_loss_clip": 1.20381904, + "balance_loss_mlp": 1.09949255, + "epoch": 0.15872061564360007, + "flos": 27930424016160.0, + "grad_norm": 3.0034854337058676, + "language_loss": 0.89355379, + "learning_rate": 3.828818810979002e-06, + "loss": 0.92401218, + "num_input_tokens_seen": 27853065, + "router_z_loss_clip": 3.76953125, + "router_z_loss_mlp": 3.65625, + "step": 1320, + "time_per_iteration": 3.0963330268859863 + }, + { + "auxiliary_loss_clip": 0.01586969, + "auxiliary_loss_mlp": 0.01470742, + "balance_loss_clip": 1.21033478, + "balance_loss_mlp": 1.10777342, + "epoch": 0.15884085853423915, + "flos": 23699116225440.0, + "grad_norm": 1.9557679570764328, + "language_loss": 0.80612946, + "learning_rate": 3.8285033518623454e-06, + "loss": 0.83670652, + "num_input_tokens_seen": 27873315, + "router_z_loss_clip": 3.76757812, + "router_z_loss_mlp": 3.62890625, + "step": 1321, + "time_per_iteration": 3.066779851913452 + }, + { + "auxiliary_loss_clip": 0.01578481, + "auxiliary_loss_mlp": 0.01465852, + "balance_loss_clip": 1.20302653, + "balance_loss_mlp": 1.0944916, + "epoch": 0.15896110142487826, + "flos": 23114599522560.0, + "grad_norm": 2.4938030106303337, + "language_loss": 0.81847256, + "learning_rate": 3.8281876153631845e-06, + "loss": 0.84891587, + "num_input_tokens_seen": 27890070, + "router_z_loss_clip": 3.75, + "router_z_loss_mlp": 3.70898438, + "step": 1322, + "time_per_iteration": 2.9906437397003174 + }, + { + "auxiliary_loss_clip": 0.01584092, + "auxiliary_loss_mlp": 0.01455058, + "balance_loss_clip": 1.20682931, + "balance_loss_mlp": 1.09208918, + "epoch": 0.15908134431551735, + "flos": 14687067853440.0, + "grad_norm": 2.1258814822511565, + "language_loss": 0.64754975, + "learning_rate": 3.827871601529416e-06, + "loss": 0.6779412, + "num_input_tokens_seen": 27908590, + "router_z_loss_clip": 3.7734375, + "router_z_loss_mlp": 3.62890625, + "step": 1323, + "time_per_iteration": 3.0017783641815186 + }, + { + "auxiliary_loss_clip": 0.01579256, + "auxiliary_loss_mlp": 0.01468659, + "balance_loss_clip": 1.20246232, + "balance_loss_mlp": 1.10473657, + "epoch": 0.15920158720615643, + "flos": 20195695039680.0, + "grad_norm": 1.6988222695023298, + "language_loss": 0.80717236, + "learning_rate": 3.827555310408979e-06, + "loss": 0.83765149, + "num_input_tokens_seen": 27927985, + "router_z_loss_clip": 3.76757812, + "router_z_loss_mlp": 3.63867188, + "step": 1324, + "time_per_iteration": 3.032170534133911 + }, + { + "auxiliary_loss_clip": 0.01574745, + "auxiliary_loss_mlp": 0.01463057, + "balance_loss_clip": 1.19731998, + "balance_loss_mlp": 1.09341335, + "epoch": 0.1593218300967955, + "flos": 24828931549440.0, + "grad_norm": 2.604206102280643, + "language_loss": 0.82939422, + "learning_rate": 3.827238742049854e-06, + "loss": 0.85977221, + "num_input_tokens_seen": 27948280, + "router_z_loss_clip": 3.77539062, + "router_z_loss_mlp": 3.6953125, + "step": 1325, + "time_per_iteration": 3.030344247817993 + }, + { + "auxiliary_loss_clip": 0.01573679, + "auxiliary_loss_mlp": 0.01478716, + "balance_loss_clip": 1.19608366, + "balance_loss_mlp": 1.10220563, + "epoch": 0.15944207298743462, + "flos": 28331176962240.0, + "grad_norm": 6.503001543950956, + "language_loss": 0.51784742, + "learning_rate": 3.826921896500066e-06, + "loss": 0.54837137, + "num_input_tokens_seen": 27969565, + "router_z_loss_clip": 3.77929688, + "router_z_loss_mlp": 3.75976562, + "step": 1326, + "time_per_iteration": 3.1358675956726074 + }, + { + "auxiliary_loss_clip": 0.01580496, + "auxiliary_loss_mlp": 0.01470916, + "balance_loss_clip": 1.20333564, + "balance_loss_mlp": 1.09993672, + "epoch": 0.1595623158780737, + "flos": 22966981666560.0, + "grad_norm": 2.1584885501417572, + "language_loss": 0.78299326, + "learning_rate": 3.826604773807678e-06, + "loss": 0.81350744, + "num_input_tokens_seen": 27987540, + "router_z_loss_clip": 3.76953125, + "router_z_loss_mlp": 3.70507812, + "step": 1327, + "time_per_iteration": 3.0620131492614746 + }, + { + "auxiliary_loss_clip": 0.015823, + "auxiliary_loss_mlp": 0.0146363, + "balance_loss_clip": 1.206429, + "balance_loss_mlp": 1.1002804, + "epoch": 0.1596825587687128, + "flos": 19712106263520.0, + "grad_norm": 2.4835931813379344, + "language_loss": 0.72955835, + "learning_rate": 3.826287374020798e-06, + "loss": 0.76001763, + "num_input_tokens_seen": 28002345, + "router_z_loss_clip": 3.76171875, + "router_z_loss_mlp": 3.63085938, + "step": 1328, + "time_per_iteration": 2.9986367225646973 + }, + { + "auxiliary_loss_clip": 0.01585955, + "auxiliary_loss_mlp": 0.01468862, + "balance_loss_clip": 1.20899606, + "balance_loss_mlp": 1.09788263, + "epoch": 0.1598028016593519, + "flos": 22639923864000.0, + "grad_norm": 2.02769933886279, + "language_loss": 0.81977087, + "learning_rate": 3.825969697187575e-06, + "loss": 0.85031903, + "num_input_tokens_seen": 28021675, + "router_z_loss_clip": 3.76757812, + "router_z_loss_mlp": 3.70703125, + "step": 1329, + "time_per_iteration": 2.9807040691375732 + }, + { + "auxiliary_loss_clip": 0.01574294, + "auxiliary_loss_mlp": 0.01464057, + "balance_loss_clip": 1.19678414, + "balance_loss_mlp": 1.08811879, + "epoch": 0.15992304454999098, + "flos": 20484862246080.0, + "grad_norm": 2.685059663828838, + "language_loss": 0.69747174, + "learning_rate": 3.8256517433562015e-06, + "loss": 0.72785521, + "num_input_tokens_seen": 28039615, + "router_z_loss_clip": 3.77539062, + "router_z_loss_mlp": 3.75390625, + "step": 1330, + "time_per_iteration": 2.910637378692627 + }, + { + "auxiliary_loss_clip": 0.01584149, + "auxiliary_loss_mlp": 0.01460514, + "balance_loss_clip": 1.20390821, + "balance_loss_mlp": 1.08705497, + "epoch": 0.16004328744063007, + "flos": 17678036568960.0, + "grad_norm": 2.4694337851134245, + "language_loss": 0.91677213, + "learning_rate": 3.82533351257491e-06, + "loss": 0.94721878, + "num_input_tokens_seen": 28057565, + "router_z_loss_clip": 3.80078125, + "router_z_loss_mlp": 3.73242188, + "step": 1331, + "time_per_iteration": 2.9686214923858643 + }, + { + "auxiliary_loss_clip": 0.015839, + "auxiliary_loss_mlp": 0.01452946, + "balance_loss_clip": 1.20733619, + "balance_loss_mlp": 1.08654404, + "epoch": 0.16016353033126918, + "flos": 24101082872640.0, + "grad_norm": 4.397173620212604, + "language_loss": 0.889018, + "learning_rate": 3.825015004891975e-06, + "loss": 0.91938645, + "num_input_tokens_seen": 28076305, + "router_z_loss_clip": 3.765625, + "router_z_loss_mlp": 3.6640625, + "step": 1332, + "time_per_iteration": 2.9889204502105713 + }, + { + "auxiliary_loss_clip": 0.01574975, + "auxiliary_loss_mlp": 0.01463899, + "balance_loss_clip": 1.19830537, + "balance_loss_mlp": 1.10112119, + "epoch": 0.16028377322190826, + "flos": 27637236424800.0, + "grad_norm": 2.12206022591817, + "language_loss": 0.76168859, + "learning_rate": 3.824696220355716e-06, + "loss": 0.7920773, + "num_input_tokens_seen": 28097895, + "router_z_loss_clip": 3.765625, + "router_z_loss_mlp": 3.62695312, + "step": 1333, + "time_per_iteration": 2.9754936695098877 + }, + { + "auxiliary_loss_clip": 0.01575197, + "auxiliary_loss_mlp": 0.01449656, + "balance_loss_clip": 1.19728017, + "balance_loss_mlp": 1.07810473, + "epoch": 0.16040401611254734, + "flos": 20963406576960.0, + "grad_norm": 2.712125072617652, + "language_loss": 0.7905196, + "learning_rate": 3.824377159014491e-06, + "loss": 0.82076812, + "num_input_tokens_seen": 28118790, + "router_z_loss_clip": 3.78125, + "router_z_loss_mlp": 3.71289062, + "step": 1334, + "time_per_iteration": 3.019146680831909 + }, + { + "auxiliary_loss_clip": 0.01579222, + "auxiliary_loss_mlp": 0.01465161, + "balance_loss_clip": 1.20216727, + "balance_loss_mlp": 1.09723377, + "epoch": 0.16052425900318643, + "flos": 21248932680000.0, + "grad_norm": 3.2226372629509368, + "language_loss": 0.85382891, + "learning_rate": 3.824057820916702e-06, + "loss": 0.88427269, + "num_input_tokens_seen": 28135995, + "router_z_loss_clip": 3.76757812, + "router_z_loss_mlp": 3.67773438, + "step": 1335, + "time_per_iteration": 3.7823596000671387 + }, + { + "auxiliary_loss_clip": 0.01571211, + "auxiliary_loss_mlp": 0.01456739, + "balance_loss_clip": 1.19378912, + "balance_loss_mlp": 1.09033763, + "epoch": 0.16064450189382554, + "flos": 15525857491200.0, + "grad_norm": 2.171225368921037, + "language_loss": 0.72327179, + "learning_rate": 3.8237382061107904e-06, + "loss": 0.75355124, + "num_input_tokens_seen": 28152715, + "router_z_loss_clip": 3.77539062, + "router_z_loss_mlp": 3.6640625, + "step": 1336, + "time_per_iteration": 3.1473255157470703 + }, + { + "auxiliary_loss_clip": 0.01563811, + "auxiliary_loss_mlp": 0.01460806, + "balance_loss_clip": 1.18451679, + "balance_loss_mlp": 1.09383202, + "epoch": 0.16076474478446462, + "flos": 21180623335200.0, + "grad_norm": 2.4973351475697516, + "language_loss": 0.78817552, + "learning_rate": 3.823418314645243e-06, + "loss": 0.81842172, + "num_input_tokens_seen": 28171590, + "router_z_loss_clip": 3.79296875, + "router_z_loss_mlp": 3.66796875, + "step": 1337, + "time_per_iteration": 4.041828393936157 + }, + { + "auxiliary_loss_clip": 0.01579228, + "auxiliary_loss_mlp": 0.01467147, + "balance_loss_clip": 1.20179176, + "balance_loss_mlp": 1.1140964, + "epoch": 0.1608849876751037, + "flos": 18368298074880.0, + "grad_norm": 2.2337895956804585, + "language_loss": 0.75752068, + "learning_rate": 3.823098146568588e-06, + "loss": 0.78798437, + "num_input_tokens_seen": 28191295, + "router_z_loss_clip": 3.77148438, + "router_z_loss_mlp": 3.53515625, + "step": 1338, + "time_per_iteration": 3.1835074424743652 + }, + { + "auxiliary_loss_clip": 0.01577906, + "auxiliary_loss_mlp": 0.01470531, + "balance_loss_clip": 1.20080531, + "balance_loss_mlp": 1.10813487, + "epoch": 0.1610052305657428, + "flos": 29499603517440.0, + "grad_norm": 2.86457000540704, + "language_loss": 0.71401811, + "learning_rate": 3.822777701929394e-06, + "loss": 0.74450248, + "num_input_tokens_seen": 28213120, + "router_z_loss_clip": 3.77148438, + "router_z_loss_mlp": 3.62109375, + "step": 1339, + "time_per_iteration": 3.0146420001983643 + }, + { + "auxiliary_loss_clip": 0.01566225, + "auxiliary_loss_mlp": 0.01464798, + "balance_loss_clip": 1.18949497, + "balance_loss_mlp": 1.10030341, + "epoch": 0.1611254734563819, + "flos": 26800343195040.0, + "grad_norm": 2.686883448849867, + "language_loss": 0.7361877, + "learning_rate": 3.8224569807762714e-06, + "loss": 0.76649797, + "num_input_tokens_seen": 28232440, + "router_z_loss_clip": 3.76757812, + "router_z_loss_mlp": 3.640625, + "step": 1340, + "time_per_iteration": 3.833993673324585 + }, + { + "auxiliary_loss_clip": 0.01576013, + "auxiliary_loss_mlp": 0.01469185, + "balance_loss_clip": 1.19935012, + "balance_loss_mlp": 1.11022174, + "epoch": 0.16124571634702098, + "flos": 22421796829920.0, + "grad_norm": 2.2055320179877076, + "language_loss": 0.76506245, + "learning_rate": 3.822135983157873e-06, + "loss": 0.79551446, + "num_input_tokens_seen": 28251715, + "router_z_loss_clip": 3.765625, + "router_z_loss_mlp": 3.59179688, + "step": 1341, + "time_per_iteration": 3.009831190109253 + }, + { + "auxiliary_loss_clip": 0.01570192, + "auxiliary_loss_mlp": 0.01464693, + "balance_loss_clip": 1.1926291, + "balance_loss_mlp": 1.10363245, + "epoch": 0.16136595923766006, + "flos": 11000679402240.0, + "grad_norm": 2.6719673608262666, + "language_loss": 0.84189355, + "learning_rate": 3.821814709122896e-06, + "loss": 0.87224245, + "num_input_tokens_seen": 28269765, + "router_z_loss_clip": 3.7734375, + "router_z_loss_mlp": 3.61132812, + "step": 1342, + "time_per_iteration": 3.7915351390838623 + }, + { + "auxiliary_loss_clip": 0.01574462, + "auxiliary_loss_mlp": 0.01461251, + "balance_loss_clip": 1.19612193, + "balance_loss_mlp": 1.10018957, + "epoch": 0.16148620212829917, + "flos": 21217110589440.0, + "grad_norm": 4.514186050373958, + "language_loss": 0.84308648, + "learning_rate": 3.821493158720076e-06, + "loss": 0.8734436, + "num_input_tokens_seen": 28288870, + "router_z_loss_clip": 3.78515625, + "router_z_loss_mlp": 3.61132812, + "step": 1343, + "time_per_iteration": 3.1635751724243164 + }, + { + "auxiliary_loss_clip": 0.01560354, + "auxiliary_loss_mlp": 0.01461153, + "balance_loss_clip": 1.18187261, + "balance_loss_mlp": 1.0939883, + "epoch": 0.16160644501893826, + "flos": 16760052204480.0, + "grad_norm": 4.476110571503891, + "language_loss": 0.73202223, + "learning_rate": 3.821171331998191e-06, + "loss": 0.76223731, + "num_input_tokens_seen": 28305400, + "router_z_loss_clip": 3.78125, + "router_z_loss_mlp": 3.671875, + "step": 1344, + "time_per_iteration": 3.0424983501434326 + }, + { + "auxiliary_loss_clip": 0.01799643, + "auxiliary_loss_mlp": 0.01427277, + "balance_loss_clip": 1.42612791, + "balance_loss_mlp": 1.11141968, + "epoch": 0.16172668790957734, + "flos": 64451244677760.0, + "grad_norm": 0.722380570539264, + "language_loss": 0.54481274, + "learning_rate": 3.820849229006064e-06, + "loss": 0.57708192, + "num_input_tokens_seen": 28373150, + "router_z_loss_clip": 3.734375, + "router_z_loss_mlp": 3.15625, + "step": 1345, + "time_per_iteration": 3.548386335372925 + }, + { + "auxiliary_loss_clip": 0.01567241, + "auxiliary_loss_mlp": 0.01467699, + "balance_loss_clip": 1.19034195, + "balance_loss_mlp": 1.10396731, + "epoch": 0.16184693080021645, + "flos": 23259903760800.0, + "grad_norm": 2.50531006950885, + "language_loss": 0.71216059, + "learning_rate": 3.8205268497925564e-06, + "loss": 0.74250996, + "num_input_tokens_seen": 28393620, + "router_z_loss_clip": 3.765625, + "router_z_loss_mlp": 3.63476562, + "step": 1346, + "time_per_iteration": 3.0420494079589844 + }, + { + "auxiliary_loss_clip": 0.01574887, + "auxiliary_loss_mlp": 0.01466969, + "balance_loss_clip": 1.19738364, + "balance_loss_mlp": 1.10514534, + "epoch": 0.16196717369085553, + "flos": 17452703184480.0, + "grad_norm": 2.4084811014917102, + "language_loss": 0.79116964, + "learning_rate": 3.8202041944065725e-06, + "loss": 0.82158822, + "num_input_tokens_seen": 28409440, + "router_z_loss_clip": 3.77734375, + "router_z_loss_mlp": 3.61523438, + "step": 1347, + "time_per_iteration": 2.9590232372283936 + }, + { + "auxiliary_loss_clip": 0.01572062, + "auxiliary_loss_mlp": 0.01460346, + "balance_loss_clip": 1.19379592, + "balance_loss_mlp": 1.09737754, + "epoch": 0.16208741658149461, + "flos": 23875370206560.0, + "grad_norm": 2.434313447906951, + "language_loss": 0.73483229, + "learning_rate": 3.819881262897061e-06, + "loss": 0.76515639, + "num_input_tokens_seen": 28427575, + "router_z_loss_clip": 3.78515625, + "router_z_loss_mlp": 3.63085938, + "step": 1348, + "time_per_iteration": 3.017632246017456 + }, + { + "auxiliary_loss_clip": 0.01575402, + "auxiliary_loss_mlp": 0.01463368, + "balance_loss_clip": 1.19747818, + "balance_loss_mlp": 1.09849215, + "epoch": 0.1622076594721337, + "flos": 25887175706880.0, + "grad_norm": 3.1966317982768078, + "language_loss": 0.73859417, + "learning_rate": 3.819558055313008e-06, + "loss": 0.76898187, + "num_input_tokens_seen": 28448260, + "router_z_loss_clip": 3.77734375, + "router_z_loss_mlp": 3.64648438, + "step": 1349, + "time_per_iteration": 3.0457494258880615 + }, + { + "auxiliary_loss_clip": 0.0157071, + "auxiliary_loss_mlp": 0.01470162, + "balance_loss_clip": 1.19157851, + "balance_loss_mlp": 1.10147154, + "epoch": 0.1623279023627728, + "flos": 21541703061600.0, + "grad_norm": 1.920318690911984, + "language_loss": 0.7721169, + "learning_rate": 3.819234571703444e-06, + "loss": 0.8025257, + "num_input_tokens_seen": 28467085, + "router_z_loss_clip": 3.78710938, + "router_z_loss_mlp": 3.68359375, + "step": 1350, + "time_per_iteration": 3.0752594470977783 + }, + { + "auxiliary_loss_clip": 0.01570464, + "auxiliary_loss_mlp": 0.01443724, + "balance_loss_clip": 1.19243824, + "balance_loss_mlp": 1.07427084, + "epoch": 0.1624481452534119, + "flos": 22087305108000.0, + "grad_norm": 2.261303037871663, + "language_loss": 0.85663104, + "learning_rate": 3.8189108121174435e-06, + "loss": 0.88677293, + "num_input_tokens_seen": 28486850, + "router_z_loss_clip": 3.77734375, + "router_z_loss_mlp": 3.69335938, + "step": 1351, + "time_per_iteration": 3.0349843502044678 + }, + { + "auxiliary_loss_clip": 0.01561861, + "auxiliary_loss_mlp": 0.0146578, + "balance_loss_clip": 1.18340397, + "balance_loss_mlp": 1.09556377, + "epoch": 0.16256838814405097, + "flos": 27089851754880.0, + "grad_norm": 2.0466983273706214, + "language_loss": 0.83701265, + "learning_rate": 3.818586776604118e-06, + "loss": 0.86728907, + "num_input_tokens_seen": 28507490, + "router_z_loss_clip": 3.78515625, + "router_z_loss_mlp": 3.69921875, + "step": 1352, + "time_per_iteration": 2.9527645111083984 + }, + { + "auxiliary_loss_clip": 0.0157172, + "auxiliary_loss_mlp": 0.01454189, + "balance_loss_clip": 1.19387054, + "balance_loss_mlp": 1.08568954, + "epoch": 0.16268863103469008, + "flos": 20122417105920.0, + "grad_norm": 1.938498706855866, + "language_loss": 0.6185472, + "learning_rate": 3.818262465212625e-06, + "loss": 0.64880633, + "num_input_tokens_seen": 28527615, + "router_z_loss_clip": 3.77734375, + "router_z_loss_mlp": 3.68164062, + "step": 1353, + "time_per_iteration": 2.949784278869629 + }, + { + "auxiliary_loss_clip": 0.01574275, + "auxiliary_loss_mlp": 0.01453054, + "balance_loss_clip": 1.19565201, + "balance_loss_mlp": 1.08321881, + "epoch": 0.16280887392532917, + "flos": 18334769217120.0, + "grad_norm": 2.6117781620006824, + "language_loss": 0.77447736, + "learning_rate": 3.817937877992161e-06, + "loss": 0.80475062, + "num_input_tokens_seen": 28544910, + "router_z_loss_clip": 3.78515625, + "router_z_loss_mlp": 3.6953125, + "step": 1354, + "time_per_iteration": 2.938532829284668 + }, + { + "auxiliary_loss_clip": 0.01577286, + "auxiliary_loss_mlp": 0.01445979, + "balance_loss_clip": 1.19803858, + "balance_loss_mlp": 1.07576251, + "epoch": 0.16292911681596825, + "flos": 11875918366080.0, + "grad_norm": 4.185446188033442, + "language_loss": 0.85415602, + "learning_rate": 3.817613014991967e-06, + "loss": 0.88438869, + "num_input_tokens_seen": 28561050, + "router_z_loss_clip": 3.79296875, + "router_z_loss_mlp": 3.69921875, + "step": 1355, + "time_per_iteration": 2.9437155723571777 + }, + { + "auxiliary_loss_clip": 0.01566473, + "auxiliary_loss_mlp": 0.01469281, + "balance_loss_clip": 1.18898046, + "balance_loss_mlp": 1.10268879, + "epoch": 0.16304935970660733, + "flos": 26105833735200.0, + "grad_norm": 2.369343066328002, + "language_loss": 0.76636493, + "learning_rate": 3.817287876261323e-06, + "loss": 0.79672253, + "num_input_tokens_seen": 28581385, + "router_z_loss_clip": 3.77148438, + "router_z_loss_mlp": 3.66210938, + "step": 1356, + "time_per_iteration": 3.0270588397979736 + }, + { + "auxiliary_loss_clip": 0.01565959, + "auxiliary_loss_mlp": 0.01445991, + "balance_loss_clip": 1.18663263, + "balance_loss_mlp": 1.07710958, + "epoch": 0.16316960259724644, + "flos": 29354564776320.0, + "grad_norm": 1.8171729416209377, + "language_loss": 0.80116117, + "learning_rate": 3.816962461849553e-06, + "loss": 0.83128071, + "num_input_tokens_seen": 28603255, + "router_z_loss_clip": 3.79296875, + "router_z_loss_mlp": 3.6875, + "step": 1357, + "time_per_iteration": 3.0889291763305664 + }, + { + "auxiliary_loss_clip": 0.0157042, + "auxiliary_loss_mlp": 0.01459433, + "balance_loss_clip": 1.19133425, + "balance_loss_mlp": 1.09207797, + "epoch": 0.16328984548788553, + "flos": 20888877013920.0, + "grad_norm": 2.1355371479779808, + "language_loss": 0.84610212, + "learning_rate": 3.8166367718060235e-06, + "loss": 0.87640065, + "num_input_tokens_seen": 28623145, + "router_z_loss_clip": 3.79101562, + "router_z_loss_mlp": 3.66992188, + "step": 1358, + "time_per_iteration": 3.0718142986297607 + }, + { + "auxiliary_loss_clip": 0.01569898, + "auxiliary_loss_mlp": 0.01455041, + "balance_loss_clip": 1.19101763, + "balance_loss_mlp": 1.08425283, + "epoch": 0.1634100883785246, + "flos": 18043060824000.0, + "grad_norm": 4.453987146530485, + "language_loss": 0.76320976, + "learning_rate": 3.816310806180139e-06, + "loss": 0.79345918, + "num_input_tokens_seen": 28641555, + "router_z_loss_clip": 3.7890625, + "router_z_loss_mlp": 3.70703125, + "step": 1359, + "time_per_iteration": 3.0547573566436768 + }, + { + "auxiliary_loss_clip": 0.01577042, + "auxiliary_loss_mlp": 0.01455893, + "balance_loss_clip": 1.19924951, + "balance_loss_mlp": 1.08510435, + "epoch": 0.16353033126916372, + "flos": 24574468973760.0, + "grad_norm": 2.3748784801517195, + "language_loss": 0.80998445, + "learning_rate": 3.81598456502135e-06, + "loss": 0.84031379, + "num_input_tokens_seen": 28661575, + "router_z_loss_clip": 3.77734375, + "router_z_loss_mlp": 3.703125, + "step": 1360, + "time_per_iteration": 3.0052671432495117 + }, + { + "auxiliary_loss_clip": 0.01580451, + "auxiliary_loss_mlp": 0.01449612, + "balance_loss_clip": 1.20253479, + "balance_loss_mlp": 1.07729745, + "epoch": 0.1636505741598028, + "flos": 19894466678400.0, + "grad_norm": 2.1353225828352254, + "language_loss": 0.87263578, + "learning_rate": 3.8156580483791455e-06, + "loss": 0.90293646, + "num_input_tokens_seen": 28676765, + "router_z_loss_clip": 3.77929688, + "router_z_loss_mlp": 3.71875, + "step": 1361, + "time_per_iteration": 3.0612528324127197 + }, + { + "auxiliary_loss_clip": 0.01584252, + "auxiliary_loss_mlp": 0.01451455, + "balance_loss_clip": 1.20758963, + "balance_loss_mlp": 1.08390903, + "epoch": 0.16377081705044189, + "flos": 28405137602880.0, + "grad_norm": 2.514089174483825, + "language_loss": 0.7698788, + "learning_rate": 3.815331256303059e-06, + "loss": 0.80023593, + "num_input_tokens_seen": 28696795, + "router_z_loss_clip": 3.76171875, + "router_z_loss_mlp": 3.66992188, + "step": 1362, + "time_per_iteration": 3.85394549369812 + }, + { + "auxiliary_loss_clip": 0.01583616, + "auxiliary_loss_mlp": 0.01438067, + "balance_loss_clip": 1.2063359, + "balance_loss_mlp": 1.07166553, + "epoch": 0.163891059941081, + "flos": 21910216707360.0, + "grad_norm": 2.8599050303896476, + "language_loss": 0.77489585, + "learning_rate": 3.815004188842665e-06, + "loss": 0.80511266, + "num_input_tokens_seen": 28714835, + "router_z_loss_clip": 3.77539062, + "router_z_loss_mlp": 3.66015625, + "step": 1363, + "time_per_iteration": 2.984440803527832 + }, + { + "auxiliary_loss_clip": 0.01575101, + "auxiliary_loss_mlp": 0.01447386, + "balance_loss_clip": 1.19848764, + "balance_loss_mlp": 1.07850504, + "epoch": 0.16401130283172008, + "flos": 26800077697920.0, + "grad_norm": 1.8837451321180982, + "language_loss": 0.79977041, + "learning_rate": 3.814676846047578e-06, + "loss": 0.82999527, + "num_input_tokens_seen": 28735710, + "router_z_loss_clip": 3.76757812, + "router_z_loss_mlp": 3.6875, + "step": 1364, + "time_per_iteration": 3.102381944656372 + }, + { + "auxiliary_loss_clip": 0.01579174, + "auxiliary_loss_mlp": 0.01469755, + "balance_loss_clip": 1.20324624, + "balance_loss_mlp": 1.10621417, + "epoch": 0.16413154572235916, + "flos": 33000066306720.0, + "grad_norm": 2.0330973771549528, + "language_loss": 0.69523573, + "learning_rate": 3.8143492279674565e-06, + "loss": 0.72572494, + "num_input_tokens_seen": 28758405, + "router_z_loss_clip": 3.75976562, + "router_z_loss_mlp": 3.63476562, + "step": 1365, + "time_per_iteration": 4.054987668991089 + }, + { + "auxiliary_loss_clip": 0.01823759, + "auxiliary_loss_mlp": 0.01390732, + "balance_loss_clip": 1.44875991, + "balance_loss_mlp": 1.05809021, + "epoch": 0.16425178861299825, + "flos": 40118797478880.0, + "grad_norm": 0.8399388172475549, + "language_loss": 0.58298749, + "learning_rate": 3.8140213346519997e-06, + "loss": 0.61513239, + "num_input_tokens_seen": 28809000, + "router_z_loss_clip": 3.75, + "router_z_loss_mlp": 3.328125, + "step": 1366, + "time_per_iteration": 3.2333247661590576 + }, + { + "auxiliary_loss_clip": 0.01582565, + "auxiliary_loss_mlp": 0.01456996, + "balance_loss_clip": 1.20625198, + "balance_loss_mlp": 1.09002209, + "epoch": 0.16437203150363736, + "flos": 25449859650240.0, + "grad_norm": 3.4643622536597074, + "language_loss": 0.76998037, + "learning_rate": 3.813693166150948e-06, + "loss": 0.80037594, + "num_input_tokens_seen": 28829210, + "router_z_loss_clip": 3.76171875, + "router_z_loss_mlp": 3.66796875, + "step": 1367, + "time_per_iteration": 3.769310712814331 + }, + { + "auxiliary_loss_clip": 0.01572856, + "auxiliary_loss_mlp": 0.01462102, + "balance_loss_clip": 1.19884944, + "balance_loss_mlp": 1.0995146, + "epoch": 0.16449227439427644, + "flos": 23479130711520.0, + "grad_norm": 2.1541784636545125, + "language_loss": 0.85876453, + "learning_rate": 3.813364722514086e-06, + "loss": 0.88911414, + "num_input_tokens_seen": 28847545, + "router_z_loss_clip": 3.74414062, + "router_z_loss_mlp": 3.62695312, + "step": 1368, + "time_per_iteration": 3.0059235095977783 + }, + { + "auxiliary_loss_clip": 0.01576418, + "auxiliary_loss_mlp": 0.01446079, + "balance_loss_clip": 1.20062125, + "balance_loss_mlp": 1.07910526, + "epoch": 0.16461251728491552, + "flos": 13546480932000.0, + "grad_norm": 2.8867845034361377, + "language_loss": 0.80221993, + "learning_rate": 3.8130360037912368e-06, + "loss": 0.83244491, + "num_input_tokens_seen": 28863990, + "router_z_loss_clip": 3.7578125, + "router_z_loss_mlp": 3.66601562, + "step": 1369, + "time_per_iteration": 2.9186038970947266 + }, + { + "auxiliary_loss_clip": 0.01566012, + "auxiliary_loss_mlp": 0.0144291, + "balance_loss_clip": 1.19035244, + "balance_loss_mlp": 1.07784319, + "epoch": 0.16473276017555463, + "flos": 23005554969600.0, + "grad_norm": 2.3266216647424307, + "language_loss": 0.81756854, + "learning_rate": 3.812707010032268e-06, + "loss": 0.8476578, + "num_input_tokens_seen": 28883045, + "router_z_loss_clip": 3.75976562, + "router_z_loss_mlp": 3.65039062, + "step": 1370, + "time_per_iteration": 3.6938982009887695 + }, + { + "auxiliary_loss_clip": 0.01577944, + "auxiliary_loss_mlp": 0.01443494, + "balance_loss_clip": 1.20291317, + "balance_loss_mlp": 1.08205104, + "epoch": 0.16485300306619372, + "flos": 24793013217600.0, + "grad_norm": 2.104565530811103, + "language_loss": 0.79603422, + "learning_rate": 3.8123777412870863e-06, + "loss": 0.82624865, + "num_input_tokens_seen": 28902545, + "router_z_loss_clip": 3.75, + "router_z_loss_mlp": 3.61523438, + "step": 1371, + "time_per_iteration": 2.9757120609283447 + }, + { + "auxiliary_loss_clip": 0.01570753, + "auxiliary_loss_mlp": 0.01446659, + "balance_loss_clip": 1.19499826, + "balance_loss_mlp": 1.08655119, + "epoch": 0.1649732459568328, + "flos": 21108672887040.0, + "grad_norm": 2.082348247282019, + "language_loss": 0.78427613, + "learning_rate": 3.812048197605643e-06, + "loss": 0.8144502, + "num_input_tokens_seen": 28921440, + "router_z_loss_clip": 3.7578125, + "router_z_loss_mlp": 3.6015625, + "step": 1372, + "time_per_iteration": 2.973360061645508 + }, + { + "auxiliary_loss_clip": 0.01565508, + "auxiliary_loss_mlp": 0.01453411, + "balance_loss_clip": 1.19070303, + "balance_loss_mlp": 1.0925411, + "epoch": 0.16509348884747188, + "flos": 20268821260800.0, + "grad_norm": 2.225785498784953, + "language_loss": 0.8175475, + "learning_rate": 3.8117183790379277e-06, + "loss": 0.84773672, + "num_input_tokens_seen": 28939890, + "router_z_loss_clip": 3.75, + "router_z_loss_mlp": 3.609375, + "step": 1373, + "time_per_iteration": 3.0568861961364746 + }, + { + "auxiliary_loss_clip": 0.01566321, + "auxiliary_loss_mlp": 0.01456325, + "balance_loss_clip": 1.19043183, + "balance_loss_mlp": 1.1046102, + "epoch": 0.165213731738111, + "flos": 11037052872000.0, + "grad_norm": 3.7287731994746625, + "language_loss": 0.93867117, + "learning_rate": 3.811388285633976e-06, + "loss": 0.96889764, + "num_input_tokens_seen": 28955875, + "router_z_loss_clip": 3.75195312, + "router_z_loss_mlp": 3.51953125, + "step": 1374, + "time_per_iteration": 3.0061984062194824 + }, + { + "auxiliary_loss_clip": 0.01575846, + "auxiliary_loss_mlp": 0.01452296, + "balance_loss_clip": 1.20183229, + "balance_loss_mlp": 1.10039043, + "epoch": 0.16533397462875007, + "flos": 29974696385760.0, + "grad_norm": 2.326998975219822, + "language_loss": 0.62427974, + "learning_rate": 3.811057917443861e-06, + "loss": 0.65456116, + "num_input_tokens_seen": 28975140, + "router_z_loss_clip": 3.73828125, + "router_z_loss_mlp": 3.51953125, + "step": 1375, + "time_per_iteration": 3.066108465194702 + }, + { + "auxiliary_loss_clip": 0.01793653, + "auxiliary_loss_mlp": 0.0138176, + "balance_loss_clip": 1.41960049, + "balance_loss_mlp": 1.06285095, + "epoch": 0.16545421751938916, + "flos": 65564067821760.0, + "grad_norm": 0.8686165306213347, + "language_loss": 0.68225086, + "learning_rate": 3.8107272745177e-06, + "loss": 0.71400493, + "num_input_tokens_seen": 29047470, + "router_z_loss_clip": 3.75, + "router_z_loss_mlp": 3.1875, + "step": 1376, + "time_per_iteration": 3.7317495346069336 + }, + { + "auxiliary_loss_clip": 0.01571606, + "auxiliary_loss_mlp": 0.01462382, + "balance_loss_clip": 1.19648385, + "balance_loss_mlp": 1.11352789, + "epoch": 0.16557446041002827, + "flos": 22494581697600.0, + "grad_norm": 2.0962542325195903, + "language_loss": 0.78603399, + "learning_rate": 3.8103963569056513e-06, + "loss": 0.81637383, + "num_input_tokens_seen": 29066605, + "router_z_loss_clip": 3.75195312, + "router_z_loss_mlp": 3.49023438, + "step": 1377, + "time_per_iteration": 3.066211462020874 + }, + { + "auxiliary_loss_clip": 0.01571792, + "auxiliary_loss_mlp": 0.01435787, + "balance_loss_clip": 1.19581175, + "balance_loss_mlp": 1.07072031, + "epoch": 0.16569470330066735, + "flos": 24604736009760.0, + "grad_norm": 2.5204051725760284, + "language_loss": 0.87661469, + "learning_rate": 3.8100651646579146e-06, + "loss": 0.90669048, + "num_input_tokens_seen": 29085815, + "router_z_loss_clip": 3.75390625, + "router_z_loss_mlp": 3.64648438, + "step": 1378, + "time_per_iteration": 3.035139560699463 + }, + { + "auxiliary_loss_clip": 0.01575419, + "auxiliary_loss_mlp": 0.01465261, + "balance_loss_clip": 1.19932818, + "balance_loss_mlp": 1.1148808, + "epoch": 0.16581494619130643, + "flos": 15007639940640.0, + "grad_norm": 2.912696204732078, + "language_loss": 0.92828345, + "learning_rate": 3.8097336978247317e-06, + "loss": 0.95869029, + "num_input_tokens_seen": 29102520, + "router_z_loss_clip": 3.76171875, + "router_z_loss_mlp": 3.50585938, + "step": 1379, + "time_per_iteration": 3.0135657787323 + }, + { + "auxiliary_loss_clip": 0.01569777, + "auxiliary_loss_mlp": 0.01460095, + "balance_loss_clip": 1.19371176, + "balance_loss_mlp": 1.10818887, + "epoch": 0.16593518908194552, + "flos": 17422246507680.0, + "grad_norm": 2.2518498451709346, + "language_loss": 0.89153504, + "learning_rate": 3.8094019564563854e-06, + "loss": 0.92183375, + "num_input_tokens_seen": 29119450, + "router_z_loss_clip": 3.76171875, + "router_z_loss_mlp": 3.52148438, + "step": 1380, + "time_per_iteration": 2.9276468753814697 + }, + { + "auxiliary_loss_clip": 0.01560771, + "auxiliary_loss_mlp": 0.01443391, + "balance_loss_clip": 1.18548274, + "balance_loss_mlp": 1.08747959, + "epoch": 0.16605543197258463, + "flos": 20414732349600.0, + "grad_norm": 2.2742985478327062, + "language_loss": 0.75269818, + "learning_rate": 3.809069940603201e-06, + "loss": 0.78273976, + "num_input_tokens_seen": 29137405, + "router_z_loss_clip": 3.75390625, + "router_z_loss_mlp": 3.55859375, + "step": 1381, + "time_per_iteration": 3.0188989639282227 + }, + { + "auxiliary_loss_clip": 0.01570635, + "auxiliary_loss_mlp": 0.01443472, + "balance_loss_clip": 1.19525468, + "balance_loss_mlp": 1.09442687, + "epoch": 0.1661756748632237, + "flos": 14211481919040.0, + "grad_norm": 2.40020546305905, + "language_loss": 0.78216541, + "learning_rate": 3.8087376503155452e-06, + "loss": 0.81230652, + "num_input_tokens_seen": 29154890, + "router_z_loss_clip": 3.75390625, + "router_z_loss_mlp": 3.49414062, + "step": 1382, + "time_per_iteration": 2.9345531463623047 + }, + { + "auxiliary_loss_clip": 0.01773517, + "auxiliary_loss_mlp": 0.01362221, + "balance_loss_clip": 1.39921033, + "balance_loss_mlp": 1.04254913, + "epoch": 0.1662959177538628, + "flos": 66086912972160.0, + "grad_norm": 0.9104231276086171, + "language_loss": 0.5626598, + "learning_rate": 3.808405085643826e-06, + "loss": 0.59401721, + "num_input_tokens_seen": 29219770, + "router_z_loss_clip": 3.75, + "router_z_loss_mlp": 3.1953125, + "step": 1383, + "time_per_iteration": 3.53782057762146 + }, + { + "auxiliary_loss_clip": 0.01565493, + "auxiliary_loss_mlp": 0.01438894, + "balance_loss_clip": 1.189731, + "balance_loss_mlp": 1.0806942, + "epoch": 0.1664161606445019, + "flos": 20742662499840.0, + "grad_norm": 3.935144599417393, + "language_loss": 0.88800657, + "learning_rate": 3.8080722466384925e-06, + "loss": 0.91805041, + "num_input_tokens_seen": 29237620, + "router_z_loss_clip": 3.75585938, + "router_z_loss_mlp": 3.58203125, + "step": 1384, + "time_per_iteration": 3.1754000186920166 + }, + { + "auxiliary_loss_clip": 0.01559158, + "auxiliary_loss_mlp": 0.01451725, + "balance_loss_clip": 1.18372464, + "balance_loss_mlp": 1.09543204, + "epoch": 0.166536403535141, + "flos": 25263061640640.0, + "grad_norm": 2.9640443936200644, + "language_loss": 0.70924842, + "learning_rate": 3.8077391333500376e-06, + "loss": 0.73935723, + "num_input_tokens_seen": 29256760, + "router_z_loss_clip": 3.75195312, + "router_z_loss_mlp": 3.56640625, + "step": 1385, + "time_per_iteration": 2.956696033477783 + }, + { + "auxiliary_loss_clip": 0.01568424, + "auxiliary_loss_mlp": 0.01458311, + "balance_loss_clip": 1.19147873, + "balance_loss_mlp": 1.10373497, + "epoch": 0.16665664642578007, + "flos": 25449821722080.0, + "grad_norm": 1.882336541348413, + "language_loss": 0.76920336, + "learning_rate": 3.8074057458289934e-06, + "loss": 0.79947072, + "num_input_tokens_seen": 29277450, + "router_z_loss_clip": 3.76953125, + "router_z_loss_mlp": 3.54492188, + "step": 1386, + "time_per_iteration": 3.0127501487731934 + }, + { + "auxiliary_loss_clip": 0.01555024, + "auxiliary_loss_mlp": 0.01456644, + "balance_loss_clip": 1.17914248, + "balance_loss_mlp": 1.09672678, + "epoch": 0.16677688931641918, + "flos": 22202987088960.0, + "grad_norm": 2.3439815084410847, + "language_loss": 0.82607377, + "learning_rate": 3.807072084125934e-06, + "loss": 0.85619044, + "num_input_tokens_seen": 29299300, + "router_z_loss_clip": 3.75976562, + "router_z_loss_mlp": 3.59765625, + "step": 1387, + "time_per_iteration": 3.000147581100464 + }, + { + "auxiliary_loss_clip": 0.01558601, + "auxiliary_loss_mlp": 0.01429795, + "balance_loss_clip": 1.18314242, + "balance_loss_mlp": 1.07121372, + "epoch": 0.16689713220705826, + "flos": 16947836346240.0, + "grad_norm": 2.435469132964383, + "language_loss": 0.80561215, + "learning_rate": 3.806738148291477e-06, + "loss": 0.83549607, + "num_input_tokens_seen": 29316125, + "router_z_loss_clip": 3.7578125, + "router_z_loss_mlp": 3.5859375, + "step": 1388, + "time_per_iteration": 2.9085981845855713 + }, + { + "auxiliary_loss_clip": 0.01559909, + "auxiliary_loss_mlp": 0.01441388, + "balance_loss_clip": 1.18494892, + "balance_loss_mlp": 1.08147132, + "epoch": 0.16701737509769735, + "flos": 36247242293280.0, + "grad_norm": 3.791120765156361, + "language_loss": 0.71613991, + "learning_rate": 3.8064039383762793e-06, + "loss": 0.74615288, + "num_input_tokens_seen": 29338490, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 3.60351562, + "step": 1389, + "time_per_iteration": 3.0403592586517334 + }, + { + "auxiliary_loss_clip": 0.01559651, + "auxiliary_loss_mlp": 0.01442158, + "balance_loss_clip": 1.18386459, + "balance_loss_mlp": 1.0820508, + "epoch": 0.16713761798833643, + "flos": 23260927821120.0, + "grad_norm": 4.786686542721772, + "language_loss": 0.77191573, + "learning_rate": 3.8060694544310396e-06, + "loss": 0.80193388, + "num_input_tokens_seen": 29357000, + "router_z_loss_clip": 3.7578125, + "router_z_loss_mlp": 3.6015625, + "step": 1390, + "time_per_iteration": 3.762326240539551 + }, + { + "auxiliary_loss_clip": 0.01565311, + "auxiliary_loss_mlp": 0.0143219, + "balance_loss_clip": 1.1886586, + "balance_loss_mlp": 1.06712317, + "epoch": 0.16725786087897554, + "flos": 25304934693600.0, + "grad_norm": 2.0707500140114234, + "language_loss": 0.78807354, + "learning_rate": 3.8057346965065006e-06, + "loss": 0.81804848, + "num_input_tokens_seen": 29378230, + "router_z_loss_clip": 3.765625, + "router_z_loss_mlp": 3.6484375, + "step": 1391, + "time_per_iteration": 2.973120927810669 + }, + { + "auxiliary_loss_clip": 0.01557444, + "auxiliary_loss_mlp": 0.0142107, + "balance_loss_clip": 1.18179488, + "balance_loss_mlp": 1.0586741, + "epoch": 0.16737810376961462, + "flos": 31834218866400.0, + "grad_norm": 1.7039535463019253, + "language_loss": 0.8442241, + "learning_rate": 3.805399664653443e-06, + "loss": 0.87400919, + "num_input_tokens_seen": 29400370, + "router_z_loss_clip": 3.75585938, + "router_z_loss_mlp": 3.62109375, + "step": 1392, + "time_per_iteration": 3.9856362342834473 + }, + { + "auxiliary_loss_clip": 0.01557924, + "auxiliary_loss_mlp": 0.01420892, + "balance_loss_clip": 1.18318677, + "balance_loss_mlp": 1.0562073, + "epoch": 0.1674983466602537, + "flos": 27964180442880.0, + "grad_norm": 2.650512328109817, + "language_loss": 0.74221921, + "learning_rate": 3.805064358922692e-06, + "loss": 0.77200735, + "num_input_tokens_seen": 29418660, + "router_z_loss_clip": 3.74609375, + "router_z_loss_mlp": 3.64257812, + "step": 1393, + "time_per_iteration": 3.0541138648986816 + }, + { + "auxiliary_loss_clip": 0.01559929, + "auxiliary_loss_mlp": 0.01423788, + "balance_loss_clip": 1.18394446, + "balance_loss_mlp": 1.06081986, + "epoch": 0.16761858955089282, + "flos": 21764836612800.0, + "grad_norm": 3.1149554245012276, + "language_loss": 0.81191051, + "learning_rate": 3.8047287793651136e-06, + "loss": 0.84174776, + "num_input_tokens_seen": 29440105, + "router_z_loss_clip": 3.76171875, + "router_z_loss_mlp": 3.62695312, + "step": 1394, + "time_per_iteration": 3.8201804161071777 + }, + { + "auxiliary_loss_clip": 0.01561623, + "auxiliary_loss_mlp": 0.01425104, + "balance_loss_clip": 1.18652201, + "balance_loss_mlp": 1.05736697, + "epoch": 0.1677388324415319, + "flos": 23807781496800.0, + "grad_norm": 1.8781358052335893, + "language_loss": 0.88705099, + "learning_rate": 3.8043929260316137e-06, + "loss": 0.91691828, + "num_input_tokens_seen": 29458260, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 3.671875, + "step": 1395, + "time_per_iteration": 3.190167188644409 + }, + { + "auxiliary_loss_clip": 0.01571056, + "auxiliary_loss_mlp": 0.01422657, + "balance_loss_clip": 1.19646347, + "balance_loss_mlp": 1.05186844, + "epoch": 0.16785907533217098, + "flos": 20560984791840.0, + "grad_norm": 2.1059667987801234, + "language_loss": 0.83389086, + "learning_rate": 3.8040567989731417e-06, + "loss": 0.86382794, + "num_input_tokens_seen": 29476205, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 3.70507812, + "step": 1396, + "time_per_iteration": 3.1055541038513184 + }, + { + "auxiliary_loss_clip": 0.01563132, + "auxiliary_loss_mlp": 0.01435949, + "balance_loss_clip": 1.19002318, + "balance_loss_mlp": 1.0722177, + "epoch": 0.16797931822281006, + "flos": 15671503082880.0, + "grad_norm": 2.4026862836691314, + "language_loss": 0.79600638, + "learning_rate": 3.8037203982406876e-06, + "loss": 0.82599717, + "num_input_tokens_seen": 29494370, + "router_z_loss_clip": 3.73046875, + "router_z_loss_mlp": 3.63867188, + "step": 1397, + "time_per_iteration": 3.837665319442749 + }, + { + "auxiliary_loss_clip": 0.015648, + "auxiliary_loss_mlp": 0.01437438, + "balance_loss_clip": 1.19042253, + "balance_loss_mlp": 1.07790315, + "epoch": 0.16809956111344918, + "flos": 16542759589920.0, + "grad_norm": 3.4200886705524773, + "language_loss": 0.7334621, + "learning_rate": 3.8033837238852835e-06, + "loss": 0.76348442, + "num_input_tokens_seen": 29511070, + "router_z_loss_clip": 3.74414062, + "router_z_loss_mlp": 3.59570312, + "step": 1398, + "time_per_iteration": 3.068100929260254 + }, + { + "auxiliary_loss_clip": 0.01554399, + "auxiliary_loss_mlp": 0.01434103, + "balance_loss_clip": 1.18029404, + "balance_loss_mlp": 1.06865466, + "epoch": 0.16821980400408826, + "flos": 23260320970560.0, + "grad_norm": 2.0440928471774646, + "language_loss": 0.69566536, + "learning_rate": 3.8030467759580017e-06, + "loss": 0.72555035, + "num_input_tokens_seen": 29531990, + "router_z_loss_clip": 3.74609375, + "router_z_loss_mlp": 3.65234375, + "step": 1399, + "time_per_iteration": 3.1579835414886475 + }, + { + "auxiliary_loss_clip": 0.01555164, + "auxiliary_loss_mlp": 0.01440784, + "balance_loss_clip": 1.1818465, + "balance_loss_mlp": 1.07705295, + "epoch": 0.16834004689472734, + "flos": 20776722351840.0, + "grad_norm": 2.017651836421966, + "language_loss": 0.87078512, + "learning_rate": 3.802709554509958e-06, + "loss": 0.90074462, + "num_input_tokens_seen": 29549790, + "router_z_loss_clip": 3.734375, + "router_z_loss_mlp": 3.63671875, + "step": 1400, + "time_per_iteration": 2.978079319000244 + }, + { + "auxiliary_loss_clip": 0.01556053, + "auxiliary_loss_mlp": 0.01431011, + "balance_loss_clip": 1.18270731, + "balance_loss_mlp": 1.07605314, + "epoch": 0.16846028978536645, + "flos": 26689705659360.0, + "grad_norm": 2.0643727408224737, + "language_loss": 0.79068613, + "learning_rate": 3.8023720595923083e-06, + "loss": 0.82055676, + "num_input_tokens_seen": 29569045, + "router_z_loss_clip": 3.73046875, + "router_z_loss_mlp": 3.55273438, + "step": 1401, + "time_per_iteration": 3.0531370639801025 + }, + { + "auxiliary_loss_clip": 0.01566662, + "auxiliary_loss_mlp": 0.01428009, + "balance_loss_clip": 1.19450974, + "balance_loss_mlp": 1.07419634, + "epoch": 0.16858053267600553, + "flos": 18845287351200.0, + "grad_norm": 7.928518339584555, + "language_loss": 0.87793469, + "learning_rate": 3.80203429125625e-06, + "loss": 0.90788138, + "num_input_tokens_seen": 29587220, + "router_z_loss_clip": 3.72265625, + "router_z_loss_mlp": 3.54296875, + "step": 1402, + "time_per_iteration": 2.998354434967041 + }, + { + "auxiliary_loss_clip": 0.01562135, + "auxiliary_loss_mlp": 0.01426719, + "balance_loss_clip": 1.1890893, + "balance_loss_mlp": 1.06775606, + "epoch": 0.16870077556664462, + "flos": 27746963684640.0, + "grad_norm": 2.102681043481771, + "language_loss": 0.70072281, + "learning_rate": 3.8016962495530225e-06, + "loss": 0.73061138, + "num_input_tokens_seen": 29606410, + "router_z_loss_clip": 3.72851562, + "router_z_loss_mlp": 3.58984375, + "step": 1403, + "time_per_iteration": 3.1916627883911133 + }, + { + "auxiliary_loss_clip": 0.01553835, + "auxiliary_loss_mlp": 0.01419852, + "balance_loss_clip": 1.18037271, + "balance_loss_mlp": 1.06661153, + "epoch": 0.1688210184572837, + "flos": 13732216953120.0, + "grad_norm": 2.6344349016834188, + "language_loss": 0.77124935, + "learning_rate": 3.8013579345339063e-06, + "loss": 0.80098617, + "num_input_tokens_seen": 29621275, + "router_z_loss_clip": 3.73242188, + "router_z_loss_mlp": 3.53515625, + "step": 1404, + "time_per_iteration": 3.0634496212005615 + }, + { + "auxiliary_loss_clip": 0.01561832, + "auxiliary_loss_mlp": 0.0144834, + "balance_loss_clip": 1.1891222, + "balance_loss_mlp": 1.09528947, + "epoch": 0.1689412613479228, + "flos": 26471199343680.0, + "grad_norm": 5.370918308462195, + "language_loss": 0.69510812, + "learning_rate": 3.801019346250224e-06, + "loss": 0.72520983, + "num_input_tokens_seen": 29641420, + "router_z_loss_clip": 3.72265625, + "router_z_loss_mlp": 3.53320312, + "step": 1405, + "time_per_iteration": 3.0315630435943604 + }, + { + "auxiliary_loss_clip": 0.01561077, + "auxiliary_loss_mlp": 0.014375, + "balance_loss_clip": 1.18930006, + "balance_loss_mlp": 1.08235168, + "epoch": 0.1690615042385619, + "flos": 21140798402880.0, + "grad_norm": 2.814819230905066, + "language_loss": 0.83951563, + "learning_rate": 3.8006804847533395e-06, + "loss": 0.86950147, + "num_input_tokens_seen": 29660935, + "router_z_loss_clip": 3.71875, + "router_z_loss_mlp": 3.5546875, + "step": 1406, + "time_per_iteration": 3.004274845123291 + }, + { + "auxiliary_loss_clip": 0.01562422, + "auxiliary_loss_mlp": 0.01416622, + "balance_loss_clip": 1.19020665, + "balance_loss_mlp": 1.06299925, + "epoch": 0.16918174712920098, + "flos": 20851403627520.0, + "grad_norm": 3.674317908884148, + "language_loss": 0.85797095, + "learning_rate": 3.8003413500946556e-06, + "loss": 0.88776141, + "num_input_tokens_seen": 29681045, + "router_z_loss_clip": 3.71875, + "router_z_loss_mlp": 3.5390625, + "step": 1407, + "time_per_iteration": 3.0798120498657227 + }, + { + "auxiliary_loss_clip": 0.01567559, + "auxiliary_loss_mlp": 0.01421592, + "balance_loss_clip": 1.19529343, + "balance_loss_mlp": 1.06625283, + "epoch": 0.1693019900198401, + "flos": 16985195948160.0, + "grad_norm": 2.712415724364923, + "language_loss": 0.83773667, + "learning_rate": 3.8000019423256216e-06, + "loss": 0.8676281, + "num_input_tokens_seen": 29698810, + "router_z_loss_clip": 3.72265625, + "router_z_loss_mlp": 3.55664062, + "step": 1408, + "time_per_iteration": 2.960214376449585 + }, + { + "auxiliary_loss_clip": 0.01556752, + "auxiliary_loss_mlp": 0.01419246, + "balance_loss_clip": 1.18515837, + "balance_loss_mlp": 1.07077336, + "epoch": 0.16942223291047917, + "flos": 26799319134720.0, + "grad_norm": 1.8759314290805074, + "language_loss": 0.88271475, + "learning_rate": 3.7996622614977234e-06, + "loss": 0.91247475, + "num_input_tokens_seen": 29720000, + "router_z_loss_clip": 3.71484375, + "router_z_loss_mlp": 3.49023438, + "step": 1409, + "time_per_iteration": 2.9629366397857666 + }, + { + "auxiliary_loss_clip": 0.01568941, + "auxiliary_loss_mlp": 0.01431453, + "balance_loss_clip": 1.19758415, + "balance_loss_mlp": 1.07516003, + "epoch": 0.16954247580111825, + "flos": 18585438976800.0, + "grad_norm": 1.7926026456006137, + "language_loss": 0.79598331, + "learning_rate": 3.799322307662492e-06, + "loss": 0.82598728, + "num_input_tokens_seen": 29737820, + "router_z_loss_clip": 3.71289062, + "router_z_loss_mlp": 3.5625, + "step": 1410, + "time_per_iteration": 3.032566785812378 + }, + { + "auxiliary_loss_clip": 0.0156311, + "auxiliary_loss_mlp": 0.01432858, + "balance_loss_clip": 1.19289029, + "balance_loss_mlp": 1.08209705, + "epoch": 0.16966271869175734, + "flos": 13984972761600.0, + "grad_norm": 2.4217994844031785, + "language_loss": 0.84117323, + "learning_rate": 3.798982080871496e-06, + "loss": 0.87113297, + "num_input_tokens_seen": 29752960, + "router_z_loss_clip": 3.70507812, + "router_z_loss_mlp": 3.50976562, + "step": 1411, + "time_per_iteration": 2.9426381587982178 + }, + { + "auxiliary_loss_clip": 0.01555389, + "auxiliary_loss_mlp": 0.01431981, + "balance_loss_clip": 1.18439198, + "balance_loss_mlp": 1.08064699, + "epoch": 0.16978296158239645, + "flos": 37490350124160.0, + "grad_norm": 5.695799135817193, + "language_loss": 0.67757261, + "learning_rate": 3.798641581176349e-06, + "loss": 0.70744634, + "num_input_tokens_seen": 29775240, + "router_z_loss_clip": 3.71289062, + "router_z_loss_mlp": 3.515625, + "step": 1412, + "time_per_iteration": 3.038214683532715 + }, + { + "auxiliary_loss_clip": 0.01559451, + "auxiliary_loss_mlp": 0.01428017, + "balance_loss_clip": 1.18734813, + "balance_loss_mlp": 1.07878196, + "epoch": 0.16990320447303553, + "flos": 28331442459360.0, + "grad_norm": 11.412076795351018, + "language_loss": 0.74904317, + "learning_rate": 3.7983008086287044e-06, + "loss": 0.77891791, + "num_input_tokens_seen": 29796560, + "router_z_loss_clip": 3.72070312, + "router_z_loss_mlp": 3.49609375, + "step": 1413, + "time_per_iteration": 2.9891233444213867 + }, + { + "auxiliary_loss_clip": 0.01558669, + "auxiliary_loss_mlp": 0.01423241, + "balance_loss_clip": 1.18775702, + "balance_loss_mlp": 1.07152605, + "epoch": 0.1700234473636746, + "flos": 20189967887520.0, + "grad_norm": 2.0964471763166497, + "language_loss": 0.7936697, + "learning_rate": 3.797959763280257e-06, + "loss": 0.82348877, + "num_input_tokens_seen": 29815245, + "router_z_loss_clip": 3.7109375, + "router_z_loss_mlp": 3.51757812, + "step": 1414, + "time_per_iteration": 2.9502546787261963 + }, + { + "auxiliary_loss_clip": 0.01562069, + "auxiliary_loss_mlp": 0.01417822, + "balance_loss_clip": 1.19009018, + "balance_loss_mlp": 1.06973124, + "epoch": 0.17014369025431372, + "flos": 24860715711840.0, + "grad_norm": 2.628885588729065, + "language_loss": 0.79172957, + "learning_rate": 3.797618445182743e-06, + "loss": 0.82152843, + "num_input_tokens_seen": 29836640, + "router_z_loss_clip": 3.71875, + "router_z_loss_mlp": 3.48242188, + "step": 1415, + "time_per_iteration": 3.1065545082092285 + }, + { + "auxiliary_loss_clip": 0.01558343, + "auxiliary_loss_mlp": 0.01431776, + "balance_loss_clip": 1.18706727, + "balance_loss_mlp": 1.07987022, + "epoch": 0.1702639331449528, + "flos": 16468836877440.0, + "grad_norm": 2.2005590769024876, + "language_loss": 0.85079551, + "learning_rate": 3.79727685438794e-06, + "loss": 0.88069665, + "num_input_tokens_seen": 29850830, + "router_z_loss_clip": 3.7109375, + "router_z_loss_mlp": 3.52539062, + "step": 1416, + "time_per_iteration": 2.972050666809082 + }, + { + "auxiliary_loss_clip": 0.01723542, + "auxiliary_loss_mlp": 0.01384399, + "balance_loss_clip": 1.35788226, + "balance_loss_mlp": 1.08380127, + "epoch": 0.1703841760355919, + "flos": 52514185389120.0, + "grad_norm": 0.8679908047484058, + "language_loss": 0.61622548, + "learning_rate": 3.796934990947667e-06, + "loss": 0.64730489, + "num_input_tokens_seen": 29912515, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 3.0, + "step": 1417, + "time_per_iteration": 4.2791783809661865 + }, + { + "auxiliary_loss_clip": 0.01718523, + "auxiliary_loss_mlp": 0.01369995, + "balance_loss_clip": 1.35275018, + "balance_loss_mlp": 1.06939697, + "epoch": 0.170504418926231, + "flos": 49375750530240.0, + "grad_norm": 0.8914750808383902, + "language_loss": 0.6240558, + "learning_rate": 3.7965928549137854e-06, + "loss": 0.6549409, + "num_input_tokens_seen": 29969330, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 3.0, + "step": 1418, + "time_per_iteration": 3.3913846015930176 + }, + { + "auxiliary_loss_clip": 0.01557537, + "auxiliary_loss_mlp": 0.01422225, + "balance_loss_clip": 1.1841104, + "balance_loss_mlp": 1.06936622, + "epoch": 0.17062466181687008, + "flos": 25851636656640.0, + "grad_norm": 2.3620004746661274, + "language_loss": 0.77761674, + "learning_rate": 3.7962504463381953e-06, + "loss": 0.80741441, + "num_input_tokens_seen": 29990820, + "router_z_loss_clip": 3.73046875, + "router_z_loss_mlp": 3.53320312, + "step": 1419, + "time_per_iteration": 4.015071392059326 + }, + { + "auxiliary_loss_clip": 0.01568457, + "auxiliary_loss_mlp": 0.0142864, + "balance_loss_clip": 1.19729388, + "balance_loss_mlp": 1.08112144, + "epoch": 0.17074490470750917, + "flos": 20962572157440.0, + "grad_norm": 1.7226038689641858, + "language_loss": 0.78938729, + "learning_rate": 3.7959077652728412e-06, + "loss": 0.81935823, + "num_input_tokens_seen": 30009275, + "router_z_loss_clip": 3.71484375, + "router_z_loss_mlp": 3.47851562, + "step": 1420, + "time_per_iteration": 3.0913429260253906 + }, + { + "auxiliary_loss_clip": 0.01561973, + "auxiliary_loss_mlp": 0.01429185, + "balance_loss_clip": 1.18884277, + "balance_loss_mlp": 1.0789963, + "epoch": 0.17086514759814825, + "flos": 20961889450560.0, + "grad_norm": 2.380343393476577, + "language_loss": 0.77443135, + "learning_rate": 3.795564811769707e-06, + "loss": 0.80434293, + "num_input_tokens_seen": 30027630, + "router_z_loss_clip": 3.734375, + "router_z_loss_mlp": 3.5078125, + "step": 1421, + "time_per_iteration": 3.8424699306488037 + }, + { + "auxiliary_loss_clip": 0.01565582, + "auxiliary_loss_mlp": 0.01415092, + "balance_loss_clip": 1.19215834, + "balance_loss_mlp": 1.06375861, + "epoch": 0.17098539048878736, + "flos": 28476519128640.0, + "grad_norm": 4.603715778346735, + "language_loss": 0.78646541, + "learning_rate": 3.795221585880818e-06, + "loss": 0.81627214, + "num_input_tokens_seen": 30048310, + "router_z_loss_clip": 3.73632812, + "router_z_loss_mlp": 3.51757812, + "step": 1422, + "time_per_iteration": 3.019585132598877 + }, + { + "auxiliary_loss_clip": 0.01573962, + "auxiliary_loss_mlp": 0.01434679, + "balance_loss_clip": 1.19936907, + "balance_loss_mlp": 1.08563387, + "epoch": 0.17110563337942644, + "flos": 16291900189440.0, + "grad_norm": 2.222854153671861, + "language_loss": 0.91313565, + "learning_rate": 3.794878087658242e-06, + "loss": 0.94322205, + "num_input_tokens_seen": 30066080, + "router_z_loss_clip": 3.74609375, + "router_z_loss_mlp": 3.4921875, + "step": 1423, + "time_per_iteration": 2.9885640144348145 + }, + { + "auxiliary_loss_clip": 0.01561683, + "auxiliary_loss_mlp": 0.01435055, + "balance_loss_clip": 1.18887424, + "balance_loss_mlp": 1.08372176, + "epoch": 0.17122587627006552, + "flos": 29676502277280.0, + "grad_norm": 1.8870112311839817, + "language_loss": 0.78558105, + "learning_rate": 3.7945343171540873e-06, + "loss": 0.81554842, + "num_input_tokens_seen": 30086955, + "router_z_loss_clip": 3.73046875, + "router_z_loss_mlp": 3.515625, + "step": 1424, + "time_per_iteration": 3.0033857822418213 + }, + { + "auxiliary_loss_clip": 0.01559437, + "auxiliary_loss_mlp": 0.01414505, + "balance_loss_clip": 1.185992, + "balance_loss_mlp": 1.0595479, + "epoch": 0.17134611916070464, + "flos": 25340777169120.0, + "grad_norm": 2.1577072601520166, + "language_loss": 0.79001677, + "learning_rate": 3.7941902744205033e-06, + "loss": 0.81975615, + "num_input_tokens_seen": 30107990, + "router_z_loss_clip": 3.734375, + "router_z_loss_mlp": 3.55078125, + "step": 1425, + "time_per_iteration": 3.829718589782715 + }, + { + "auxiliary_loss_clip": 0.01566689, + "auxiliary_loss_mlp": 0.01421187, + "balance_loss_clip": 1.19346702, + "balance_loss_mlp": 1.06775546, + "epoch": 0.17146636205134372, + "flos": 13955540145120.0, + "grad_norm": 2.6468381499166656, + "language_loss": 0.83353525, + "learning_rate": 3.7938459595096817e-06, + "loss": 0.86341405, + "num_input_tokens_seen": 30126535, + "router_z_loss_clip": 3.73242188, + "router_z_loss_mlp": 3.53515625, + "step": 1426, + "time_per_iteration": 2.9944283962249756 + }, + { + "auxiliary_loss_clip": 0.0156613, + "auxiliary_loss_mlp": 0.01419418, + "balance_loss_clip": 1.19318986, + "balance_loss_mlp": 1.06655884, + "epoch": 0.1715866049419828, + "flos": 23917508756640.0, + "grad_norm": 2.467396597064041, + "language_loss": 0.86024863, + "learning_rate": 3.7935013724738545e-06, + "loss": 0.89010417, + "num_input_tokens_seen": 30147035, + "router_z_loss_clip": 3.73242188, + "router_z_loss_mlp": 3.52929688, + "step": 1427, + "time_per_iteration": 2.9130003452301025 + }, + { + "auxiliary_loss_clip": 0.01565446, + "auxiliary_loss_mlp": 0.01424021, + "balance_loss_clip": 1.19321978, + "balance_loss_mlp": 1.06830001, + "epoch": 0.17170684783262188, + "flos": 22711495030560.0, + "grad_norm": 2.1329688964369575, + "language_loss": 0.78098941, + "learning_rate": 3.7931565133652945e-06, + "loss": 0.81088406, + "num_input_tokens_seen": 30167110, + "router_z_loss_clip": 3.7265625, + "router_z_loss_mlp": 3.55664062, + "step": 1428, + "time_per_iteration": 3.091874122619629 + }, + { + "auxiliary_loss_clip": 0.01559362, + "auxiliary_loss_mlp": 0.01409824, + "balance_loss_clip": 1.18529248, + "balance_loss_mlp": 1.05448496, + "epoch": 0.171827090723261, + "flos": 26615631234240.0, + "grad_norm": 2.360806302430913, + "language_loss": 0.67970008, + "learning_rate": 3.792811382236317e-06, + "loss": 0.70939195, + "num_input_tokens_seen": 30185620, + "router_z_loss_clip": 3.74609375, + "router_z_loss_mlp": 3.55664062, + "step": 1429, + "time_per_iteration": 3.0175743103027344 + }, + { + "auxiliary_loss_clip": 0.01568232, + "auxiliary_loss_mlp": 0.01404467, + "balance_loss_clip": 1.19467068, + "balance_loss_mlp": 1.05446815, + "epoch": 0.17194733361390008, + "flos": 28151092236960.0, + "grad_norm": 2.2416979530698327, + "language_loss": 0.78811491, + "learning_rate": 3.792465979139279e-06, + "loss": 0.81784183, + "num_input_tokens_seen": 30208225, + "router_z_loss_clip": 3.73632812, + "router_z_loss_mlp": 3.50585938, + "step": 1430, + "time_per_iteration": 3.036807060241699 + }, + { + "auxiliary_loss_clip": 0.01713301, + "auxiliary_loss_mlp": 0.01420174, + "balance_loss_clip": 1.34706163, + "balance_loss_mlp": 1.12644196, + "epoch": 0.17206757650453916, + "flos": 65536531977600.0, + "grad_norm": 0.9722426261409092, + "language_loss": 0.65633512, + "learning_rate": 3.792120304126576e-06, + "loss": 0.68766987, + "num_input_tokens_seen": 30271600, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 2.9375, + "step": 1431, + "time_per_iteration": 3.5448575019836426 + }, + { + "auxiliary_loss_clip": 0.01565434, + "auxiliary_loss_mlp": 0.01399877, + "balance_loss_clip": 1.19276333, + "balance_loss_mlp": 1.04873466, + "epoch": 0.17218781939517827, + "flos": 22275696100320.0, + "grad_norm": 2.0259310972632094, + "language_loss": 0.83747816, + "learning_rate": 3.791774357250649e-06, + "loss": 0.86713123, + "num_input_tokens_seen": 30290430, + "router_z_loss_clip": 3.72460938, + "router_z_loss_mlp": 3.515625, + "step": 1432, + "time_per_iteration": 2.918274402618408 + }, + { + "auxiliary_loss_clip": 0.01565689, + "auxiliary_loss_mlp": 0.01408371, + "balance_loss_clip": 1.19419408, + "balance_loss_mlp": 1.0583725, + "epoch": 0.17230806228581735, + "flos": 14139076332960.0, + "grad_norm": 2.5008502468985188, + "language_loss": 0.79942584, + "learning_rate": 3.7914281385639757e-06, + "loss": 0.82916647, + "num_input_tokens_seen": 30308305, + "router_z_loss_clip": 3.71679688, + "router_z_loss_mlp": 3.50390625, + "step": 1433, + "time_per_iteration": 2.958881139755249 + }, + { + "auxiliary_loss_clip": 0.01559433, + "auxiliary_loss_mlp": 0.01406279, + "balance_loss_clip": 1.18703401, + "balance_loss_mlp": 1.05380058, + "epoch": 0.17242830517645644, + "flos": 20706971736960.0, + "grad_norm": 4.003192059714798, + "language_loss": 0.79770374, + "learning_rate": 3.7910816481190784e-06, + "loss": 0.82736087, + "num_input_tokens_seen": 30328120, + "router_z_loss_clip": 3.72070312, + "router_z_loss_mlp": 3.52734375, + "step": 1434, + "time_per_iteration": 3.1931772232055664 + }, + { + "auxiliary_loss_clip": 0.01560874, + "auxiliary_loss_mlp": 0.01411557, + "balance_loss_clip": 1.18733799, + "balance_loss_mlp": 1.06213069, + "epoch": 0.17254854806709552, + "flos": 30777264266400.0, + "grad_norm": 2.9635149811682027, + "language_loss": 0.74947476, + "learning_rate": 3.7907348859685193e-06, + "loss": 0.77919906, + "num_input_tokens_seen": 30349825, + "router_z_loss_clip": 3.73632812, + "router_z_loss_mlp": 3.5, + "step": 1435, + "time_per_iteration": 3.069957733154297 + }, + { + "auxiliary_loss_clip": 0.01562502, + "auxiliary_loss_mlp": 0.01407299, + "balance_loss_clip": 1.18888068, + "balance_loss_mlp": 1.05482137, + "epoch": 0.17266879095773463, + "flos": 26617148360640.0, + "grad_norm": 2.8235035297767825, + "language_loss": 0.80656064, + "learning_rate": 3.790387852164902e-06, + "loss": 0.83625865, + "num_input_tokens_seen": 30370555, + "router_z_loss_clip": 3.73632812, + "router_z_loss_mlp": 3.52929688, + "step": 1436, + "time_per_iteration": 3.1558971405029297 + }, + { + "auxiliary_loss_clip": 0.0155923, + "auxiliary_loss_mlp": 0.01418429, + "balance_loss_clip": 1.18558395, + "balance_loss_mlp": 1.06747651, + "epoch": 0.1727890338483737, + "flos": 20268366122880.0, + "grad_norm": 2.237698527771729, + "language_loss": 0.76755071, + "learning_rate": 3.7900405467608707e-06, + "loss": 0.79732728, + "num_input_tokens_seen": 30390100, + "router_z_loss_clip": 3.734375, + "router_z_loss_mlp": 3.51367188, + "step": 1437, + "time_per_iteration": 2.9894521236419678 + }, + { + "auxiliary_loss_clip": 0.01558002, + "auxiliary_loss_mlp": 0.01433366, + "balance_loss_clip": 1.18407691, + "balance_loss_mlp": 1.08184147, + "epoch": 0.1729092767390128, + "flos": 18181386280800.0, + "grad_norm": 22.07215905042527, + "language_loss": 0.79124796, + "learning_rate": 3.7896929698091114e-06, + "loss": 0.82116163, + "num_input_tokens_seen": 30402915, + "router_z_loss_clip": 3.73632812, + "router_z_loss_mlp": 3.51953125, + "step": 1438, + "time_per_iteration": 2.963986396789551 + }, + { + "auxiliary_loss_clip": 0.01569936, + "auxiliary_loss_mlp": 0.01451802, + "balance_loss_clip": 1.19577789, + "balance_loss_mlp": 1.10466444, + "epoch": 0.1730295196296519, + "flos": 26761883676480.0, + "grad_norm": 4.78887540114621, + "language_loss": 0.6861577, + "learning_rate": 3.7893451213623518e-06, + "loss": 0.71637511, + "num_input_tokens_seen": 30420145, + "router_z_loss_clip": 3.7421875, + "router_z_loss_mlp": 3.4765625, + "step": 1439, + "time_per_iteration": 3.0442276000976562 + }, + { + "auxiliary_loss_clip": 0.01562494, + "auxiliary_loss_mlp": 0.01425627, + "balance_loss_clip": 1.1887244, + "balance_loss_mlp": 1.07333946, + "epoch": 0.173149762520291, + "flos": 23844837673440.0, + "grad_norm": 2.722893865682247, + "language_loss": 0.82467365, + "learning_rate": 3.7889970014733606e-06, + "loss": 0.85455477, + "num_input_tokens_seen": 30439250, + "router_z_loss_clip": 3.73828125, + "router_z_loss_mlp": 3.52539062, + "step": 1440, + "time_per_iteration": 2.966399908065796 + }, + { + "auxiliary_loss_clip": 0.01566754, + "auxiliary_loss_mlp": 0.0143029, + "balance_loss_clip": 1.19156766, + "balance_loss_mlp": 1.08048272, + "epoch": 0.17327000541093007, + "flos": 23370465440160.0, + "grad_norm": 6.476377908526672, + "language_loss": 0.78100467, + "learning_rate": 3.7886486101949463e-06, + "loss": 0.81097519, + "num_input_tokens_seen": 30460430, + "router_z_loss_clip": 3.75, + "router_z_loss_mlp": 3.50390625, + "step": 1441, + "time_per_iteration": 3.067991018295288 + }, + { + "auxiliary_loss_clip": 0.01573951, + "auxiliary_loss_mlp": 0.01430603, + "balance_loss_clip": 1.20068979, + "balance_loss_mlp": 1.08537292, + "epoch": 0.17339024830156918, + "flos": 18223145549280.0, + "grad_norm": 2.7881367597937086, + "language_loss": 0.8841362, + "learning_rate": 3.7882999475799594e-06, + "loss": 0.91418177, + "num_input_tokens_seen": 30478465, + "router_z_loss_clip": 3.73046875, + "router_z_loss_mlp": 3.45507812, + "step": 1442, + "time_per_iteration": 3.199428081512451 + }, + { + "auxiliary_loss_clip": 0.01569364, + "auxiliary_loss_mlp": 0.01424412, + "balance_loss_clip": 1.19337249, + "balance_loss_mlp": 1.07841933, + "epoch": 0.17351049119220827, + "flos": 23334319539360.0, + "grad_norm": 2.1855728797740137, + "language_loss": 0.81809807, + "learning_rate": 3.787951013681293e-06, + "loss": 0.84803581, + "num_input_tokens_seen": 30496510, + "router_z_loss_clip": 3.76171875, + "router_z_loss_mlp": 3.46484375, + "step": 1443, + "time_per_iteration": 3.04368257522583 + }, + { + "auxiliary_loss_clip": 0.01560553, + "auxiliary_loss_mlp": 0.01419504, + "balance_loss_clip": 1.18599892, + "balance_loss_mlp": 1.07274842, + "epoch": 0.17363073408284735, + "flos": 23805657519840.0, + "grad_norm": 8.260502530400398, + "language_loss": 0.77516007, + "learning_rate": 3.787601808551879e-06, + "loss": 0.80496061, + "num_input_tokens_seen": 30516325, + "router_z_loss_clip": 3.74414062, + "router_z_loss_mlp": 3.47265625, + "step": 1444, + "time_per_iteration": 3.0264410972595215 + }, + { + "auxiliary_loss_clip": 0.01560295, + "auxiliary_loss_mlp": 0.01414623, + "balance_loss_clip": 1.18497467, + "balance_loss_mlp": 1.06824839, + "epoch": 0.17375097697348643, + "flos": 18517281344640.0, + "grad_norm": 6.441007537222864, + "language_loss": 0.84476513, + "learning_rate": 3.7872523322446926e-06, + "loss": 0.87451428, + "num_input_tokens_seen": 30535210, + "router_z_loss_clip": 3.75195312, + "router_z_loss_mlp": 3.46875, + "step": 1445, + "time_per_iteration": 3.825321912765503 + }, + { + "auxiliary_loss_clip": 0.01561418, + "auxiliary_loss_mlp": 0.01416426, + "balance_loss_clip": 1.18504894, + "balance_loss_mlp": 1.07462883, + "epoch": 0.17387121986412554, + "flos": 38881227523680.0, + "grad_norm": 2.1829073077900016, + "language_loss": 0.60435396, + "learning_rate": 3.7869025848127478e-06, + "loss": 0.63413239, + "num_input_tokens_seen": 30559405, + "router_z_loss_clip": 3.76757812, + "router_z_loss_mlp": 3.421875, + "step": 1446, + "time_per_iteration": 3.0555548667907715 + }, + { + "auxiliary_loss_clip": 0.01559103, + "auxiliary_loss_mlp": 0.01407178, + "balance_loss_clip": 1.18363476, + "balance_loss_mlp": 1.05985034, + "epoch": 0.17399146275476463, + "flos": 20377789957440.0, + "grad_norm": 3.2009839345249333, + "language_loss": 0.81384963, + "learning_rate": 3.786552566309102e-06, + "loss": 0.84351242, + "num_input_tokens_seen": 30577615, + "router_z_loss_clip": 3.75390625, + "router_z_loss_mlp": 3.47851562, + "step": 1447, + "time_per_iteration": 3.8141932487487793 + }, + { + "auxiliary_loss_clip": 0.01568132, + "auxiliary_loss_mlp": 0.01428244, + "balance_loss_clip": 1.19261348, + "balance_loss_mlp": 1.0828228, + "epoch": 0.1741117056454037, + "flos": 19165935294720.0, + "grad_norm": 3.44529786558853, + "language_loss": 0.8645891, + "learning_rate": 3.7862022767868517e-06, + "loss": 0.89455283, + "num_input_tokens_seen": 30595205, + "router_z_loss_clip": 3.75195312, + "router_z_loss_mlp": 3.45703125, + "step": 1448, + "time_per_iteration": 2.9387760162353516 + }, + { + "auxiliary_loss_clip": 0.01566562, + "auxiliary_loss_mlp": 0.01408531, + "balance_loss_clip": 1.19105482, + "balance_loss_mlp": 1.06845057, + "epoch": 0.17423194853604282, + "flos": 25376657572800.0, + "grad_norm": 3.047515635967259, + "language_loss": 0.84449589, + "learning_rate": 3.7858517162991367e-06, + "loss": 0.87424684, + "num_input_tokens_seen": 30615280, + "router_z_loss_clip": 3.75585938, + "router_z_loss_mlp": 3.40429688, + "step": 1449, + "time_per_iteration": 3.742762565612793 + }, + { + "auxiliary_loss_clip": 0.01558888, + "auxiliary_loss_mlp": 0.01391722, + "balance_loss_clip": 1.18300629, + "balance_loss_mlp": 1.04592013, + "epoch": 0.1743521914266819, + "flos": 25194221301600.0, + "grad_norm": 2.908858086406551, + "language_loss": 0.60749102, + "learning_rate": 3.7855008848991363e-06, + "loss": 0.6369971, + "num_input_tokens_seen": 30633485, + "router_z_loss_clip": 3.7578125, + "router_z_loss_mlp": 3.46289062, + "step": 1450, + "time_per_iteration": 3.015683889389038 + }, + { + "auxiliary_loss_clip": 0.01577679, + "auxiliary_loss_mlp": 0.01394092, + "balance_loss_clip": 1.20225644, + "balance_loss_mlp": 1.04714537, + "epoch": 0.17447243431732098, + "flos": 25668972816480.0, + "grad_norm": 2.260352575659926, + "language_loss": 0.77892387, + "learning_rate": 3.7851497826400714e-06, + "loss": 0.80864161, + "num_input_tokens_seen": 30653625, + "router_z_loss_clip": 3.75195312, + "router_z_loss_mlp": 3.47460938, + "step": 1451, + "time_per_iteration": 2.9389469623565674 + }, + { + "auxiliary_loss_clip": 0.01573876, + "auxiliary_loss_mlp": 0.01405238, + "balance_loss_clip": 1.19870412, + "balance_loss_mlp": 1.06229639, + "epoch": 0.17459267720796007, + "flos": 36284829464160.0, + "grad_norm": 2.1932702403828213, + "language_loss": 0.76315242, + "learning_rate": 3.7847984095752034e-06, + "loss": 0.7929436, + "num_input_tokens_seen": 30677080, + "router_z_loss_clip": 3.75, + "router_z_loss_mlp": 3.43359375, + "step": 1452, + "time_per_iteration": 3.2006165981292725 + }, + { + "auxiliary_loss_clip": 0.01568586, + "auxiliary_loss_mlp": 0.01394033, + "balance_loss_clip": 1.19255567, + "balance_loss_mlp": 1.05204546, + "epoch": 0.17471292009859918, + "flos": 20013182912160.0, + "grad_norm": 2.3279794879922893, + "language_loss": 0.80464375, + "learning_rate": 3.784446765757836e-06, + "loss": 0.83427, + "num_input_tokens_seen": 30695725, + "router_z_loss_clip": 3.7578125, + "router_z_loss_mlp": 3.42382812, + "step": 1453, + "time_per_iteration": 3.9166922569274902 + }, + { + "auxiliary_loss_clip": 0.01574703, + "auxiliary_loss_mlp": 0.01387099, + "balance_loss_clip": 1.19898736, + "balance_loss_mlp": 1.04587483, + "epoch": 0.17483316298923826, + "flos": 27821758744800.0, + "grad_norm": 2.4949117309129947, + "language_loss": 0.77579975, + "learning_rate": 3.7840948512413133e-06, + "loss": 0.80541778, + "num_input_tokens_seen": 30713310, + "router_z_loss_clip": 3.75585938, + "router_z_loss_mlp": 3.41601562, + "step": 1454, + "time_per_iteration": 3.184473991394043 + }, + { + "auxiliary_loss_clip": 0.01564323, + "auxiliary_loss_mlp": 0.01388337, + "balance_loss_clip": 1.18894577, + "balance_loss_mlp": 1.04177189, + "epoch": 0.17495340587987734, + "flos": 44021303136000.0, + "grad_norm": 2.204579753797275, + "language_loss": 0.78663552, + "learning_rate": 3.7837426660790196e-06, + "loss": 0.81616211, + "num_input_tokens_seen": 30734725, + "router_z_loss_clip": 3.75585938, + "router_z_loss_mlp": 3.47070312, + "step": 1455, + "time_per_iteration": 3.1363790035247803 + }, + { + "auxiliary_loss_clip": 0.01575235, + "auxiliary_loss_mlp": 0.01406574, + "balance_loss_clip": 1.20032096, + "balance_loss_mlp": 1.06287026, + "epoch": 0.17507364877051645, + "flos": 20887890881760.0, + "grad_norm": 2.797717463143788, + "language_loss": 0.81768644, + "learning_rate": 3.783390210324382e-06, + "loss": 0.8475045, + "num_input_tokens_seen": 30754450, + "router_z_loss_clip": 3.75390625, + "router_z_loss_mlp": 3.44140625, + "step": 1456, + "time_per_iteration": 3.016894578933716 + }, + { + "auxiliary_loss_clip": 0.0156589, + "auxiliary_loss_mlp": 0.01385983, + "balance_loss_clip": 1.19081688, + "balance_loss_mlp": 1.04533052, + "epoch": 0.17519389166115554, + "flos": 24719924924640.0, + "grad_norm": 2.3242278480365353, + "language_loss": 0.72174805, + "learning_rate": 3.7830374840308676e-06, + "loss": 0.75126684, + "num_input_tokens_seen": 30774605, + "router_z_loss_clip": 3.75585938, + "router_z_loss_mlp": 3.41015625, + "step": 1457, + "time_per_iteration": 2.9913077354431152 + }, + { + "auxiliary_loss_clip": 0.01579668, + "auxiliary_loss_mlp": 0.01390079, + "balance_loss_clip": 1.20433009, + "balance_loss_mlp": 1.04904509, + "epoch": 0.17531413455179462, + "flos": 23800006224000.0, + "grad_norm": 8.721009405023912, + "language_loss": 0.82853436, + "learning_rate": 3.7826844872519842e-06, + "loss": 0.85823184, + "num_input_tokens_seen": 30792460, + "router_z_loss_clip": 3.75585938, + "router_z_loss_mlp": 3.4140625, + "step": 1458, + "time_per_iteration": 3.0250096321105957 + }, + { + "auxiliary_loss_clip": 0.01579647, + "auxiliary_loss_mlp": 0.01378661, + "balance_loss_clip": 1.20273006, + "balance_loss_mlp": 1.03705478, + "epoch": 0.1754343774424337, + "flos": 24574734470880.0, + "grad_norm": 2.1535991461351083, + "language_loss": 0.72384131, + "learning_rate": 3.782331220041282e-06, + "loss": 0.75342435, + "num_input_tokens_seen": 30812525, + "router_z_loss_clip": 3.77148438, + "router_z_loss_mlp": 3.41992188, + "step": 1459, + "time_per_iteration": 3.1324944496154785 + }, + { + "auxiliary_loss_clip": 0.01571225, + "auxiliary_loss_mlp": 0.0138343, + "balance_loss_clip": 1.19511318, + "balance_loss_mlp": 1.03800941, + "epoch": 0.17555462033307281, + "flos": 18116604254880.0, + "grad_norm": 2.5889839333879108, + "language_loss": 0.83124089, + "learning_rate": 3.7819776824523504e-06, + "loss": 0.86078739, + "num_input_tokens_seen": 30830390, + "router_z_loss_clip": 3.76171875, + "router_z_loss_mlp": 3.45898438, + "step": 1460, + "time_per_iteration": 3.0106828212738037 + }, + { + "auxiliary_loss_clip": 0.01576327, + "auxiliary_loss_mlp": 0.01385055, + "balance_loss_clip": 1.19982362, + "balance_loss_mlp": 1.0415417, + "epoch": 0.1756748632237119, + "flos": 28368536564160.0, + "grad_norm": 3.8342084801275793, + "language_loss": 0.84166801, + "learning_rate": 3.7816238745388213e-06, + "loss": 0.87128186, + "num_input_tokens_seen": 30849935, + "router_z_loss_clip": 3.76757812, + "router_z_loss_mlp": 3.43945312, + "step": 1461, + "time_per_iteration": 3.0706591606140137 + }, + { + "auxiliary_loss_clip": 0.01566018, + "auxiliary_loss_mlp": 0.01385791, + "balance_loss_clip": 1.1907258, + "balance_loss_mlp": 1.04666483, + "epoch": 0.17579510611435098, + "flos": 25734968543520.0, + "grad_norm": 2.2333286528768563, + "language_loss": 0.87225425, + "learning_rate": 3.781269796354367e-06, + "loss": 0.90177238, + "num_input_tokens_seen": 30869555, + "router_z_loss_clip": 3.75585938, + "router_z_loss_mlp": 3.39453125, + "step": 1462, + "time_per_iteration": 3.046475648880005 + }, + { + "auxiliary_loss_clip": 0.01567062, + "auxiliary_loss_mlp": 0.01407114, + "balance_loss_clip": 1.19085121, + "balance_loss_mlp": 1.06417239, + "epoch": 0.1759153490049901, + "flos": 18590066212320.0, + "grad_norm": 2.1305174253833803, + "language_loss": 0.86185968, + "learning_rate": 3.7809154479527006e-06, + "loss": 0.89160132, + "num_input_tokens_seen": 30888760, + "router_z_loss_clip": 3.75976562, + "router_z_loss_mlp": 3.43359375, + "step": 1463, + "time_per_iteration": 3.0434978008270264 + }, + { + "auxiliary_loss_clip": 0.01577128, + "auxiliary_loss_mlp": 0.0145993, + "balance_loss_clip": 1.20262706, + "balance_loss_mlp": 1.1206131, + "epoch": 0.17603559189562917, + "flos": 18621015955200.0, + "grad_norm": 10.707003011176765, + "language_loss": 0.84667003, + "learning_rate": 3.780560829387577e-06, + "loss": 0.87704062, + "num_input_tokens_seen": 30907260, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 3.39648438, + "step": 1464, + "time_per_iteration": 3.002901554107666 + }, + { + "auxiliary_loss_clip": 0.01768824, + "auxiliary_loss_mlp": 0.0164624, + "balance_loss_clip": 1.39985764, + "balance_loss_mlp": 1.29376221, + "epoch": 0.17615583478626826, + "flos": 60536261020320.0, + "grad_norm": 0.9472920343621677, + "language_loss": 0.57884443, + "learning_rate": 3.7802059407127915e-06, + "loss": 0.61299515, + "num_input_tokens_seen": 30965810, + "router_z_loss_clip": 3.6875, + "router_z_loss_mlp": 3.53125, + "step": 1465, + "time_per_iteration": 3.4401657581329346 + }, + { + "auxiliary_loss_clip": 0.0156934, + "auxiliary_loss_mlp": 0.01639803, + "balance_loss_clip": 1.19421387, + "balance_loss_mlp": 1.31955898, + "epoch": 0.17627607767690734, + "flos": 23618100947040.0, + "grad_norm": 4.294777256463213, + "language_loss": 0.86373079, + "learning_rate": 3.7798507819821797e-06, + "loss": 0.89582217, + "num_input_tokens_seen": 30982935, + "router_z_loss_clip": 3.75, + "router_z_loss_mlp": 3.20117188, + "step": 1466, + "time_per_iteration": 3.0143792629241943 + }, + { + "auxiliary_loss_clip": 0.0157071, + "auxiliary_loss_mlp": 0.01610216, + "balance_loss_clip": 1.19464779, + "balance_loss_mlp": 1.28634834, + "epoch": 0.17639632056754645, + "flos": 17640980392320.0, + "grad_norm": 2.58151981514539, + "language_loss": 0.79246211, + "learning_rate": 3.7794953532496197e-06, + "loss": 0.82427138, + "num_input_tokens_seen": 30998840, + "router_z_loss_clip": 3.76367188, + "router_z_loss_mlp": 3.23828125, + "step": 1467, + "time_per_iteration": 2.9337387084960938 + }, + { + "auxiliary_loss_clip": 0.0176742, + "auxiliary_loss_mlp": 0.0146616, + "balance_loss_clip": 1.39852476, + "balance_loss_mlp": 1.16174698, + "epoch": 0.17651656345818553, + "flos": 57939559171200.0, + "grad_norm": 0.8843222419091527, + "language_loss": 0.57890666, + "learning_rate": 3.7791396545690295e-06, + "loss": 0.61124241, + "num_input_tokens_seen": 31060075, + "router_z_loss_clip": 3.6875, + "router_z_loss_mlp": 3.0390625, + "step": 1468, + "time_per_iteration": 3.3569650650024414 + }, + { + "auxiliary_loss_clip": 0.01575151, + "auxiliary_loss_mlp": 0.0150529, + "balance_loss_clip": 1.20029736, + "balance_loss_mlp": 1.16368365, + "epoch": 0.17663680634882462, + "flos": 22931708113440.0, + "grad_norm": 3.0797205624905275, + "language_loss": 0.80762839, + "learning_rate": 3.7787836859943685e-06, + "loss": 0.83843279, + "num_input_tokens_seen": 31078800, + "router_z_loss_clip": 3.74609375, + "router_z_loss_mlp": 3.41992188, + "step": 1469, + "time_per_iteration": 3.0053160190582275 + }, + { + "auxiliary_loss_clip": 0.01569183, + "auxiliary_loss_mlp": 0.01455753, + "balance_loss_clip": 1.19308937, + "balance_loss_mlp": 1.11014104, + "epoch": 0.17675704923946373, + "flos": 22640189361120.0, + "grad_norm": 10.813165673663555, + "language_loss": 0.79042292, + "learning_rate": 3.7784274475796363e-06, + "loss": 0.82067227, + "num_input_tokens_seen": 31097430, + "router_z_loss_clip": 3.75976562, + "router_z_loss_mlp": 3.4609375, + "step": 1470, + "time_per_iteration": 2.9588329792022705 + }, + { + "auxiliary_loss_clip": 0.01575063, + "auxiliary_loss_mlp": 0.01397914, + "balance_loss_clip": 1.19983792, + "balance_loss_mlp": 1.04886889, + "epoch": 0.1768772921301028, + "flos": 27129031908480.0, + "grad_norm": 3.04697441866358, + "language_loss": 0.76109135, + "learning_rate": 3.7780709393788745e-06, + "loss": 0.79082114, + "num_input_tokens_seen": 31117905, + "router_z_loss_clip": 3.75, + "router_z_loss_mlp": 3.49609375, + "step": 1471, + "time_per_iteration": 3.0019326210021973 + }, + { + "auxiliary_loss_clip": 0.01578008, + "auxiliary_loss_mlp": 0.01392376, + "balance_loss_clip": 1.2029382, + "balance_loss_mlp": 1.04752731, + "epoch": 0.1769975350207419, + "flos": 19174165705440.0, + "grad_norm": 3.060696509802379, + "language_loss": 0.75285006, + "learning_rate": 3.777714161446165e-06, + "loss": 0.78255391, + "num_input_tokens_seen": 31137610, + "router_z_loss_clip": 3.75, + "router_z_loss_mlp": 3.453125, + "step": 1472, + "time_per_iteration": 3.011204242706299 + }, + { + "auxiliary_loss_clip": 0.01581952, + "auxiliary_loss_mlp": 0.01396167, + "balance_loss_clip": 1.20739102, + "balance_loss_mlp": 1.05227208, + "epoch": 0.177117777911381, + "flos": 36137742602400.0, + "grad_norm": 2.361748107014131, + "language_loss": 0.69325256, + "learning_rate": 3.7773571138356304e-06, + "loss": 0.72303367, + "num_input_tokens_seen": 31157780, + "router_z_loss_clip": 3.74414062, + "router_z_loss_mlp": 3.44335938, + "step": 1473, + "time_per_iteration": 3.8400919437408447 + }, + { + "auxiliary_loss_clip": 0.01591264, + "auxiliary_loss_mlp": 0.01373737, + "balance_loss_clip": 1.21650875, + "balance_loss_mlp": 1.03594553, + "epoch": 0.17723802080202009, + "flos": 22092766763040.0, + "grad_norm": 3.695393801735543, + "language_loss": 0.89086044, + "learning_rate": 3.776999796601435e-06, + "loss": 0.92051053, + "num_input_tokens_seen": 31176540, + "router_z_loss_clip": 3.75195312, + "router_z_loss_mlp": 3.38085938, + "step": 1474, + "time_per_iteration": 3.0255343914031982 + }, + { + "auxiliary_loss_clip": 0.01576509, + "auxiliary_loss_mlp": 0.01387175, + "balance_loss_clip": 1.20091319, + "balance_loss_mlp": 1.04556847, + "epoch": 0.17735826369265917, + "flos": 30225062720160.0, + "grad_norm": 2.2780661401985114, + "language_loss": 0.72568512, + "learning_rate": 3.776642209797783e-06, + "loss": 0.75532198, + "num_input_tokens_seen": 31198370, + "router_z_loss_clip": 3.7578125, + "router_z_loss_mlp": 3.41992188, + "step": 1475, + "time_per_iteration": 3.9759202003479004 + }, + { + "auxiliary_loss_clip": 0.01581365, + "auxiliary_loss_mlp": 0.01400398, + "balance_loss_clip": 1.20820296, + "balance_loss_mlp": 1.05745697, + "epoch": 0.17747850658329825, + "flos": 21399736501440.0, + "grad_norm": 2.6049924721760527, + "language_loss": 0.7794323, + "learning_rate": 3.7762843534789205e-06, + "loss": 0.80924988, + "num_input_tokens_seen": 31217120, + "router_z_loss_clip": 3.72851562, + "router_z_loss_mlp": 3.43359375, + "step": 1476, + "time_per_iteration": 3.124725580215454 + }, + { + "auxiliary_loss_clip": 0.01582154, + "auxiliary_loss_mlp": 0.0137794, + "balance_loss_clip": 1.20740938, + "balance_loss_mlp": 1.03404498, + "epoch": 0.17759874947393736, + "flos": 16985271804480.0, + "grad_norm": 2.39593983124295, + "language_loss": 0.88633323, + "learning_rate": 3.7759262276991343e-06, + "loss": 0.91593415, + "num_input_tokens_seen": 31234730, + "router_z_loss_clip": 3.74414062, + "router_z_loss_mlp": 3.44335938, + "step": 1477, + "time_per_iteration": 3.7775001525878906 + }, + { + "auxiliary_loss_clip": 0.0158305, + "auxiliary_loss_mlp": 0.01401835, + "balance_loss_clip": 1.20894432, + "balance_loss_mlp": 1.05870318, + "epoch": 0.17771899236457644, + "flos": 11547684790560.0, + "grad_norm": 2.342924782243274, + "language_loss": 0.80488467, + "learning_rate": 3.7755678325127506e-06, + "loss": 0.83473349, + "num_input_tokens_seen": 31252410, + "router_z_loss_clip": 3.7421875, + "router_z_loss_mlp": 3.43554688, + "step": 1478, + "time_per_iteration": 3.091989278793335 + }, + { + "auxiliary_loss_clip": 0.0156937, + "auxiliary_loss_mlp": 0.01392297, + "balance_loss_clip": 1.19444489, + "balance_loss_mlp": 1.05145419, + "epoch": 0.17783923525521553, + "flos": 18809786229120.0, + "grad_norm": 1.7376087154247173, + "language_loss": 0.753443, + "learning_rate": 3.7752091679741393e-06, + "loss": 0.78305972, + "num_input_tokens_seen": 31270200, + "router_z_loss_clip": 3.75585938, + "router_z_loss_mlp": 3.41210938, + "step": 1479, + "time_per_iteration": 2.9664008617401123 + }, + { + "auxiliary_loss_clip": 0.0157332, + "auxiliary_loss_mlp": 0.0142761, + "balance_loss_clip": 1.19900656, + "balance_loss_mlp": 1.08676648, + "epoch": 0.17795947814585464, + "flos": 30410950453920.0, + "grad_norm": 2.8279810013224607, + "language_loss": 0.77108651, + "learning_rate": 3.774850234137708e-06, + "loss": 0.80109584, + "num_input_tokens_seen": 31287495, + "router_z_loss_clip": 3.7421875, + "router_z_loss_mlp": 3.41210938, + "step": 1480, + "time_per_iteration": 3.7638697624206543 + }, + { + "auxiliary_loss_clip": 0.01585843, + "auxiliary_loss_mlp": 0.01458075, + "balance_loss_clip": 1.21089303, + "balance_loss_mlp": 1.1160872, + "epoch": 0.17807972103649372, + "flos": 24391349995680.0, + "grad_norm": 3.080096422256126, + "language_loss": 0.82626021, + "learning_rate": 3.7744910310579076e-06, + "loss": 0.85669941, + "num_input_tokens_seen": 31306420, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 3.42382812, + "step": 1481, + "time_per_iteration": 3.044294595718384 + }, + { + "auxiliary_loss_clip": 0.01580258, + "auxiliary_loss_mlp": 0.01547418, + "balance_loss_clip": 1.20632195, + "balance_loss_mlp": 1.21344149, + "epoch": 0.1781999639271328, + "flos": 20303563819680.0, + "grad_norm": 2.625332141215349, + "language_loss": 0.85128051, + "learning_rate": 3.774131558789229e-06, + "loss": 0.88255727, + "num_input_tokens_seen": 31325750, + "router_z_loss_clip": 3.73828125, + "router_z_loss_mlp": 3.34179688, + "step": 1482, + "time_per_iteration": 3.087101697921753 + }, + { + "auxiliary_loss_clip": 0.01574223, + "auxiliary_loss_mlp": 0.01550468, + "balance_loss_clip": 1.19890666, + "balance_loss_mlp": 1.21229506, + "epoch": 0.1783202068177719, + "flos": 15926307012000.0, + "grad_norm": 2.402095041921131, + "language_loss": 0.69710267, + "learning_rate": 3.773771817386203e-06, + "loss": 0.72834957, + "num_input_tokens_seen": 31343080, + "router_z_loss_clip": 3.75585938, + "router_z_loss_mlp": 3.38476562, + "step": 1483, + "time_per_iteration": 2.9857852458953857 + }, + { + "auxiliary_loss_clip": 0.01575485, + "auxiliary_loss_mlp": 0.01404462, + "balance_loss_clip": 1.1993593, + "balance_loss_mlp": 1.05770564, + "epoch": 0.178440449708411, + "flos": 20633883444000.0, + "grad_norm": 2.3726835186376776, + "language_loss": 0.79173779, + "learning_rate": 3.773411806903403e-06, + "loss": 0.82153726, + "num_input_tokens_seen": 31362160, + "router_z_loss_clip": 3.7578125, + "router_z_loss_mlp": 3.47265625, + "step": 1484, + "time_per_iteration": 2.979536533355713 + }, + { + "auxiliary_loss_clip": 0.01576748, + "auxiliary_loss_mlp": 0.01397177, + "balance_loss_clip": 1.20204234, + "balance_loss_mlp": 1.05480838, + "epoch": 0.17856069259905008, + "flos": 21688031360160.0, + "grad_norm": 1.9916204976070768, + "language_loss": 0.94944745, + "learning_rate": 3.7730515273954415e-06, + "loss": 0.97918665, + "num_input_tokens_seen": 31380770, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 3.42773438, + "step": 1485, + "time_per_iteration": 2.9946699142456055 + }, + { + "auxiliary_loss_clip": 0.01583176, + "auxiliary_loss_mlp": 0.01386682, + "balance_loss_clip": 1.20697606, + "balance_loss_mlp": 1.04545712, + "epoch": 0.17868093548968916, + "flos": 26575085666880.0, + "grad_norm": 2.7967524018032495, + "language_loss": 0.8513369, + "learning_rate": 3.772690978916973e-06, + "loss": 0.88103545, + "num_input_tokens_seen": 31400525, + "router_z_loss_clip": 3.76367188, + "router_z_loss_mlp": 3.41601562, + "step": 1486, + "time_per_iteration": 3.0179731845855713 + }, + { + "auxiliary_loss_clip": 0.01570127, + "auxiliary_loss_mlp": 0.01418868, + "balance_loss_clip": 1.194736, + "balance_loss_mlp": 1.07935941, + "epoch": 0.17880117838032827, + "flos": 18582594364800.0, + "grad_norm": 2.152192597704264, + "language_loss": 0.86406791, + "learning_rate": 3.772330161522693e-06, + "loss": 0.89395785, + "num_input_tokens_seen": 31418435, + "router_z_loss_clip": 3.75390625, + "router_z_loss_mlp": 3.3984375, + "step": 1487, + "time_per_iteration": 2.9641690254211426 + }, + { + "auxiliary_loss_clip": 0.01586606, + "auxiliary_loss_mlp": 0.01440748, + "balance_loss_clip": 1.21127486, + "balance_loss_mlp": 1.10162103, + "epoch": 0.17892142127096736, + "flos": 26543604929760.0, + "grad_norm": 2.148251609399747, + "language_loss": 0.80117059, + "learning_rate": 3.7719690752673365e-06, + "loss": 0.83144414, + "num_input_tokens_seen": 31439230, + "router_z_loss_clip": 3.75195312, + "router_z_loss_mlp": 3.39453125, + "step": 1488, + "time_per_iteration": 3.1012144088745117 + }, + { + "auxiliary_loss_clip": 0.0157758, + "auxiliary_loss_mlp": 0.01411182, + "balance_loss_clip": 1.20241821, + "balance_loss_mlp": 1.07033908, + "epoch": 0.17904166416160644, + "flos": 23874573715200.0, + "grad_norm": 2.3857297620452083, + "language_loss": 0.78339082, + "learning_rate": 3.7716077202056796e-06, + "loss": 0.8132785, + "num_input_tokens_seen": 31457705, + "router_z_loss_clip": 3.75, + "router_z_loss_mlp": 3.41210938, + "step": 1489, + "time_per_iteration": 2.9841506481170654 + }, + { + "auxiliary_loss_clip": 0.015749, + "auxiliary_loss_mlp": 0.01398843, + "balance_loss_clip": 1.19836164, + "balance_loss_mlp": 1.05723667, + "epoch": 0.17916190705224552, + "flos": 19136388893760.0, + "grad_norm": 2.6596635276914466, + "language_loss": 0.93431783, + "learning_rate": 3.7712460963925404e-06, + "loss": 0.96405524, + "num_input_tokens_seen": 31473645, + "router_z_loss_clip": 3.76953125, + "router_z_loss_mlp": 3.41992188, + "step": 1490, + "time_per_iteration": 2.9520905017852783 + }, + { + "auxiliary_loss_clip": 0.01584466, + "auxiliary_loss_mlp": 0.0142215, + "balance_loss_clip": 1.20969296, + "balance_loss_mlp": 1.07844627, + "epoch": 0.17928214994288463, + "flos": 25154282584800.0, + "grad_norm": 6.181389340422237, + "language_loss": 0.75210571, + "learning_rate": 3.7708842038827775e-06, + "loss": 0.78217185, + "num_input_tokens_seen": 31492605, + "router_z_loss_clip": 3.74609375, + "router_z_loss_mlp": 3.44140625, + "step": 1491, + "time_per_iteration": 3.020284652709961 + }, + { + "auxiliary_loss_clip": 0.01578183, + "auxiliary_loss_mlp": 0.01503294, + "balance_loss_clip": 1.20326257, + "balance_loss_mlp": 1.16340423, + "epoch": 0.17940239283352372, + "flos": 22386940486560.0, + "grad_norm": 2.2533124978256427, + "language_loss": 0.85785228, + "learning_rate": 3.770522042731288e-06, + "loss": 0.88866711, + "num_input_tokens_seen": 31514500, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 3.40039062, + "step": 1492, + "time_per_iteration": 3.024512767791748 + }, + { + "auxiliary_loss_clip": 0.01587397, + "auxiliary_loss_mlp": 0.01579547, + "balance_loss_clip": 1.21110523, + "balance_loss_mlp": 1.25110114, + "epoch": 0.1795226357241628, + "flos": 23180557321440.0, + "grad_norm": 2.1697844966403723, + "language_loss": 0.88005984, + "learning_rate": 3.7701596129930122e-06, + "loss": 0.91172934, + "num_input_tokens_seen": 31533225, + "router_z_loss_clip": 3.76171875, + "router_z_loss_mlp": 3.28515625, + "step": 1493, + "time_per_iteration": 3.0730443000793457 + }, + { + "auxiliary_loss_clip": 0.01584465, + "auxiliary_loss_mlp": 0.01397097, + "balance_loss_clip": 1.20934594, + "balance_loss_mlp": 1.05243874, + "epoch": 0.1796428786148019, + "flos": 22092577122240.0, + "grad_norm": 2.596953174400904, + "language_loss": 0.73452353, + "learning_rate": 3.7697969147229315e-06, + "loss": 0.76433909, + "num_input_tokens_seen": 31551385, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 3.45117188, + "step": 1494, + "time_per_iteration": 3.029029130935669 + }, + { + "auxiliary_loss_clip": 0.01586568, + "auxiliary_loss_mlp": 0.0141331, + "balance_loss_clip": 1.21110678, + "balance_loss_mlp": 1.07036841, + "epoch": 0.179763121505441, + "flos": 21326761992960.0, + "grad_norm": 3.140831760975036, + "language_loss": 0.85591316, + "learning_rate": 3.7694339479760647e-06, + "loss": 0.88591188, + "num_input_tokens_seen": 31570415, + "router_z_loss_clip": 3.75585938, + "router_z_loss_mlp": 3.43359375, + "step": 1495, + "time_per_iteration": 3.015425682067871 + }, + { + "auxiliary_loss_clip": 0.01750815, + "auxiliary_loss_mlp": 0.01362396, + "balance_loss_clip": 1.38261211, + "balance_loss_mlp": 1.05874634, + "epoch": 0.17988336439608008, + "flos": 68168089805760.0, + "grad_norm": 0.812503954087375, + "language_loss": 0.57345343, + "learning_rate": 3.769070712807476e-06, + "loss": 0.60458553, + "num_input_tokens_seen": 31632445, + "router_z_loss_clip": 3.6875, + "router_z_loss_mlp": 3.03125, + "step": 1496, + "time_per_iteration": 3.5577521324157715 + }, + { + "auxiliary_loss_clip": 0.01581442, + "auxiliary_loss_mlp": 0.01392215, + "balance_loss_clip": 1.20586014, + "balance_loss_mlp": 1.04946446, + "epoch": 0.18000360728671919, + "flos": 21947234955840.0, + "grad_norm": 2.6396734814305276, + "language_loss": 0.78682637, + "learning_rate": 3.768707209272266e-06, + "loss": 0.81656289, + "num_input_tokens_seen": 31652575, + "router_z_loss_clip": 3.7578125, + "router_z_loss_mlp": 3.43164062, + "step": 1497, + "time_per_iteration": 3.029200792312622 + }, + { + "auxiliary_loss_clip": 0.01590175, + "auxiliary_loss_mlp": 0.01406885, + "balance_loss_clip": 1.21539474, + "balance_loss_mlp": 1.06203651, + "epoch": 0.18012385017735827, + "flos": 18988164187200.0, + "grad_norm": 2.5194574064977284, + "language_loss": 0.77045429, + "learning_rate": 3.768343437425579e-06, + "loss": 0.80042487, + "num_input_tokens_seen": 31671145, + "router_z_loss_clip": 3.7421875, + "router_z_loss_mlp": 3.453125, + "step": 1498, + "time_per_iteration": 3.045792818069458 + }, + { + "auxiliary_loss_clip": 0.01585996, + "auxiliary_loss_mlp": 0.01391987, + "balance_loss_clip": 1.21182692, + "balance_loss_mlp": 1.05095315, + "epoch": 0.18024409306799735, + "flos": 19749731362560.0, + "grad_norm": 3.1216300904317067, + "language_loss": 0.86526394, + "learning_rate": 3.7679793973225987e-06, + "loss": 0.89504379, + "num_input_tokens_seen": 31686955, + "router_z_loss_clip": 3.7421875, + "router_z_loss_mlp": 3.4140625, + "step": 1499, + "time_per_iteration": 3.0558018684387207 + }, + { + "auxiliary_loss_clip": 0.01748603, + "auxiliary_loss_mlp": 0.0131337, + "balance_loss_clip": 1.38145947, + "balance_loss_mlp": 1.01048279, + "epoch": 0.18036433595863643, + "flos": 67234137321600.0, + "grad_norm": 0.881082282666852, + "language_loss": 0.61565918, + "learning_rate": 3.767615089018549e-06, + "loss": 0.64627892, + "num_input_tokens_seen": 31749300, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 3.0234375, + "step": 1500, + "time_per_iteration": 4.221448183059692 + }, + { + "auxiliary_loss_clip": 0.01588266, + "auxiliary_loss_mlp": 0.01402832, + "balance_loss_clip": 1.21586645, + "balance_loss_mlp": 1.0585556, + "epoch": 0.18048457884927555, + "flos": 18183130976160.0, + "grad_norm": 2.214521957778854, + "language_loss": 0.8580333, + "learning_rate": 3.7672505125686966e-06, + "loss": 0.88794422, + "num_input_tokens_seen": 31765665, + "router_z_loss_clip": 3.71875, + "router_z_loss_mlp": 3.44726562, + "step": 1501, + "time_per_iteration": 3.040142297744751 + }, + { + "auxiliary_loss_clip": 0.01594841, + "auxiliary_loss_mlp": 0.01409735, + "balance_loss_clip": 1.22103238, + "balance_loss_mlp": 1.06564903, + "epoch": 0.18060482173991463, + "flos": 15815517763680.0, + "grad_norm": 2.844683030841449, + "language_loss": 0.83960861, + "learning_rate": 3.7668856680283455e-06, + "loss": 0.8696543, + "num_input_tokens_seen": 31782690, + "router_z_loss_clip": 3.74023438, + "router_z_loss_mlp": 3.4453125, + "step": 1502, + "time_per_iteration": 3.877021551132202 + }, + { + "auxiliary_loss_clip": 0.01593979, + "auxiliary_loss_mlp": 0.01583798, + "balance_loss_clip": 1.22230387, + "balance_loss_mlp": 1.25878608, + "epoch": 0.1807250646305537, + "flos": 18589345577280.0, + "grad_norm": 2.055737599590565, + "language_loss": 0.82456928, + "learning_rate": 3.7665205554528437e-06, + "loss": 0.85634708, + "num_input_tokens_seen": 31802045, + "router_z_loss_clip": 3.71875, + "router_z_loss_mlp": 3.25, + "step": 1503, + "time_per_iteration": 3.046849012374878 + }, + { + "auxiliary_loss_clip": 0.01588803, + "auxiliary_loss_mlp": 0.01570434, + "balance_loss_clip": 1.21705341, + "balance_loss_mlp": 1.2435143, + "epoch": 0.18084530752119282, + "flos": 23151390202080.0, + "grad_norm": 2.5400349564907585, + "language_loss": 0.74528611, + "learning_rate": 3.7661551748975782e-06, + "loss": 0.77687848, + "num_input_tokens_seen": 31820220, + "router_z_loss_clip": 3.71289062, + "router_z_loss_mlp": 3.26953125, + "step": 1504, + "time_per_iteration": 3.7896034717559814 + }, + { + "auxiliary_loss_clip": 0.01737206, + "auxiliary_loss_mlp": 0.01518047, + "balance_loss_clip": 1.37332654, + "balance_loss_mlp": 1.21821213, + "epoch": 0.1809655504118319, + "flos": 59808715768800.0, + "grad_norm": 0.8219597604755762, + "language_loss": 0.60446656, + "learning_rate": 3.7657895264179772e-06, + "loss": 0.63701904, + "num_input_tokens_seen": 31876195, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.9921875, + "step": 1505, + "time_per_iteration": 3.4608168601989746 + }, + { + "auxiliary_loss_clip": 0.01603197, + "auxiliary_loss_mlp": 0.0155106, + "balance_loss_clip": 1.23185062, + "balance_loss_mlp": 1.21784627, + "epoch": 0.181085793302471, + "flos": 44205408246240.0, + "grad_norm": 2.0508842110104037, + "language_loss": 0.74341309, + "learning_rate": 3.765423610069509e-06, + "loss": 0.77495575, + "num_input_tokens_seen": 31901585, + "router_z_loss_clip": 3.70898438, + "router_z_loss_mlp": 3.33398438, + "step": 1506, + "time_per_iteration": 3.1601250171661377 + }, + { + "auxiliary_loss_clip": 0.01603687, + "auxiliary_loss_mlp": 0.01399782, + "balance_loss_clip": 1.23267257, + "balance_loss_mlp": 1.05550539, + "epoch": 0.18120603619311007, + "flos": 34901348055840.0, + "grad_norm": 2.171024373839619, + "language_loss": 0.72852403, + "learning_rate": 3.765057425907683e-06, + "loss": 0.75855869, + "num_input_tokens_seen": 31923045, + "router_z_loss_clip": 3.7109375, + "router_z_loss_mlp": 3.44726562, + "step": 1507, + "time_per_iteration": 3.821770191192627 + }, + { + "auxiliary_loss_clip": 0.01594545, + "auxiliary_loss_mlp": 0.01406527, + "balance_loss_clip": 1.22334266, + "balance_loss_mlp": 1.06072426, + "epoch": 0.18132627908374918, + "flos": 21508894838880.0, + "grad_norm": 2.0581984225645495, + "language_loss": 0.7868005, + "learning_rate": 3.764690973988048e-06, + "loss": 0.8168112, + "num_input_tokens_seen": 31943385, + "router_z_loss_clip": 3.71484375, + "router_z_loss_mlp": 3.46289062, + "step": 1508, + "time_per_iteration": 3.0038721561431885 + }, + { + "auxiliary_loss_clip": 0.01598374, + "auxiliary_loss_mlp": 0.01399671, + "balance_loss_clip": 1.22769117, + "balance_loss_mlp": 1.05005383, + "epoch": 0.18144652197438826, + "flos": 29060163483840.0, + "grad_norm": 2.1067817525618096, + "language_loss": 0.7407307, + "learning_rate": 3.7643242543661967e-06, + "loss": 0.77071118, + "num_input_tokens_seen": 31966045, + "router_z_loss_clip": 3.70507812, + "router_z_loss_mlp": 3.50195312, + "step": 1509, + "time_per_iteration": 3.013679027557373 + }, + { + "auxiliary_loss_clip": 0.01728631, + "auxiliary_loss_mlp": 0.01346138, + "balance_loss_clip": 1.36459577, + "balance_loss_mlp": 1.03638458, + "epoch": 0.18156676486502735, + "flos": 68681452551840.0, + "grad_norm": 0.8345078272348913, + "language_loss": 0.60492313, + "learning_rate": 3.7639572670977573e-06, + "loss": 0.63567078, + "num_input_tokens_seen": 32021540, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 3.09375, + "step": 1510, + "time_per_iteration": 3.3924970626831055 + }, + { + "auxiliary_loss_clip": 0.01590172, + "auxiliary_loss_mlp": 0.01415531, + "balance_loss_clip": 1.21799231, + "balance_loss_mlp": 1.07430601, + "epoch": 0.18168700775566646, + "flos": 26474081883840.0, + "grad_norm": 1.5279246391620738, + "language_loss": 0.76483428, + "learning_rate": 3.7635900122384042e-06, + "loss": 0.79489124, + "num_input_tokens_seen": 32044535, + "router_z_loss_clip": 3.72265625, + "router_z_loss_mlp": 3.41601562, + "step": 1511, + "time_per_iteration": 3.1635537147521973 + }, + { + "auxiliary_loss_clip": 0.01593594, + "auxiliary_loss_mlp": 0.01524767, + "balance_loss_clip": 1.22186637, + "balance_loss_mlp": 1.1856401, + "epoch": 0.18180725064630554, + "flos": 15007071018240.0, + "grad_norm": 2.3001701595380357, + "language_loss": 0.87210011, + "learning_rate": 3.7632224898438477e-06, + "loss": 0.90328366, + "num_input_tokens_seen": 32061010, + "router_z_loss_clip": 3.72070312, + "router_z_loss_mlp": 3.39453125, + "step": 1512, + "time_per_iteration": 3.024942398071289 + }, + { + "auxiliary_loss_clip": 0.01592434, + "auxiliary_loss_mlp": 0.01516529, + "balance_loss_clip": 1.22062135, + "balance_loss_mlp": 1.16805696, + "epoch": 0.18192749353694462, + "flos": 19684418342400.0, + "grad_norm": 1.6415423694441387, + "language_loss": 0.79327822, + "learning_rate": 3.762854699969842e-06, + "loss": 0.82436782, + "num_input_tokens_seen": 32081520, + "router_z_loss_clip": 3.72070312, + "router_z_loss_mlp": 3.48828125, + "step": 1513, + "time_per_iteration": 3.0272207260131836 + }, + { + "auxiliary_loss_clip": 0.01586485, + "auxiliary_loss_mlp": 0.01512419, + "balance_loss_clip": 1.21357381, + "balance_loss_mlp": 1.16795182, + "epoch": 0.1820477364275837, + "flos": 20704733975520.0, + "grad_norm": 2.0693835569062484, + "language_loss": 0.73248541, + "learning_rate": 3.762486642672179e-06, + "loss": 0.76347446, + "num_input_tokens_seen": 32098460, + "router_z_loss_clip": 3.7265625, + "router_z_loss_mlp": 3.44921875, + "step": 1514, + "time_per_iteration": 3.013460636138916 + }, + { + "auxiliary_loss_clip": 0.01591414, + "auxiliary_loss_mlp": 0.01470188, + "balance_loss_clip": 1.2188952, + "balance_loss_mlp": 1.11961722, + "epoch": 0.18216797931822282, + "flos": 17130613970880.0, + "grad_norm": 2.0281613638936355, + "language_loss": 0.86750281, + "learning_rate": 3.7621183180066946e-06, + "loss": 0.89811879, + "num_input_tokens_seen": 32116420, + "router_z_loss_clip": 3.72265625, + "router_z_loss_mlp": 3.50585938, + "step": 1515, + "time_per_iteration": 2.977465867996216 + }, + { + "auxiliary_loss_clip": 0.01591954, + "auxiliary_loss_mlp": 0.01418381, + "balance_loss_clip": 1.21841729, + "balance_loss_mlp": 1.07734656, + "epoch": 0.1822882222088619, + "flos": 29245178869920.0, + "grad_norm": 3.0467345906177012, + "language_loss": 0.73976487, + "learning_rate": 3.7617497260292625e-06, + "loss": 0.7698682, + "num_input_tokens_seen": 32138475, + "router_z_loss_clip": 3.73828125, + "router_z_loss_mlp": 3.4140625, + "step": 1516, + "time_per_iteration": 3.0526771545410156 + }, + { + "auxiliary_loss_clip": 0.01588033, + "auxiliary_loss_mlp": 0.0141576, + "balance_loss_clip": 1.21456027, + "balance_loss_mlp": 1.07701492, + "epoch": 0.18240846509950098, + "flos": 17704131507360.0, + "grad_norm": 3.076769274608573, + "language_loss": 0.78645802, + "learning_rate": 3.7613808667957967e-06, + "loss": 0.81649595, + "num_input_tokens_seen": 32151165, + "router_z_loss_clip": 3.73632812, + "router_z_loss_mlp": 3.390625, + "step": 1517, + "time_per_iteration": 2.986398220062256 + }, + { + "auxiliary_loss_clip": 0.01590935, + "auxiliary_loss_mlp": 0.01433663, + "balance_loss_clip": 1.21758926, + "balance_loss_mlp": 1.09148455, + "epoch": 0.1825287079901401, + "flos": 14791598955360.0, + "grad_norm": 2.468401236187034, + "language_loss": 0.91372865, + "learning_rate": 3.7610117403622547e-06, + "loss": 0.94397461, + "num_input_tokens_seen": 32167725, + "router_z_loss_clip": 3.73046875, + "router_z_loss_mlp": 3.42578125, + "step": 1518, + "time_per_iteration": 3.1289477348327637 + }, + { + "auxiliary_loss_clip": 0.01593432, + "auxiliary_loss_mlp": 0.01439286, + "balance_loss_clip": 1.21994448, + "balance_loss_mlp": 1.10168576, + "epoch": 0.18264895088077918, + "flos": 21948486585120.0, + "grad_norm": 2.0981820079903217, + "language_loss": 0.90214825, + "learning_rate": 3.7606423467846313e-06, + "loss": 0.93247545, + "num_input_tokens_seen": 32187330, + "router_z_loss_clip": 3.73828125, + "router_z_loss_mlp": 3.37890625, + "step": 1519, + "time_per_iteration": 3.033756971359253 + }, + { + "auxiliary_loss_clip": 0.01589361, + "auxiliary_loss_mlp": 0.01432801, + "balance_loss_clip": 1.21528399, + "balance_loss_mlp": 1.09615397, + "epoch": 0.18276919377141826, + "flos": 20888877013920.0, + "grad_norm": 1.714498273823384, + "language_loss": 0.79659909, + "learning_rate": 3.760272686118964e-06, + "loss": 0.82682073, + "num_input_tokens_seen": 32205550, + "router_z_loss_clip": 3.74414062, + "router_z_loss_mlp": 3.36914062, + "step": 1520, + "time_per_iteration": 3.084106683731079 + }, + { + "auxiliary_loss_clip": 0.01583871, + "auxiliary_loss_mlp": 0.01430691, + "balance_loss_clip": 1.21089649, + "balance_loss_mlp": 1.08679605, + "epoch": 0.18288943666205737, + "flos": 21471990374880.0, + "grad_norm": 2.2909129754244213, + "language_loss": 0.92822373, + "learning_rate": 3.7599027584213297e-06, + "loss": 0.95836937, + "num_input_tokens_seen": 32224430, + "router_z_loss_clip": 3.72851562, + "router_z_loss_mlp": 3.44335938, + "step": 1521, + "time_per_iteration": 3.1487314701080322 + }, + { + "auxiliary_loss_clip": 0.01588458, + "auxiliary_loss_mlp": 0.01416767, + "balance_loss_clip": 1.2145257, + "balance_loss_mlp": 1.07210875, + "epoch": 0.18300967955269645, + "flos": 21541285851840.0, + "grad_norm": 1.978458446068023, + "language_loss": 0.78486764, + "learning_rate": 3.7595325637478465e-06, + "loss": 0.81491995, + "num_input_tokens_seen": 32242455, + "router_z_loss_clip": 3.7421875, + "router_z_loss_mlp": 3.45117188, + "step": 1522, + "time_per_iteration": 3.0265133380889893 + }, + { + "auxiliary_loss_clip": 0.01589392, + "auxiliary_loss_mlp": 0.01433535, + "balance_loss_clip": 1.21433651, + "balance_loss_mlp": 1.09669733, + "epoch": 0.18312992244333554, + "flos": 28878751272960.0, + "grad_norm": 2.8175265396604443, + "language_loss": 0.81925392, + "learning_rate": 3.7591621021546723e-06, + "loss": 0.84948325, + "num_input_tokens_seen": 32264450, + "router_z_loss_clip": 3.75, + "router_z_loss_mlp": 3.37109375, + "step": 1523, + "time_per_iteration": 3.0914809703826904 + }, + { + "auxiliary_loss_clip": 0.01578921, + "auxiliary_loss_mlp": 0.01414254, + "balance_loss_clip": 1.20638764, + "balance_loss_mlp": 1.07245755, + "epoch": 0.18325016533397462, + "flos": 20122151608800.0, + "grad_norm": 4.967630701422645, + "language_loss": 0.8159709, + "learning_rate": 3.7587913736980062e-06, + "loss": 0.84590262, + "num_input_tokens_seen": 32284090, + "router_z_loss_clip": 3.72851562, + "router_z_loss_mlp": 3.421875, + "step": 1524, + "time_per_iteration": 3.0808324813842773 + }, + { + "auxiliary_loss_clip": 0.01591602, + "auxiliary_loss_mlp": 0.01436355, + "balance_loss_clip": 1.21786404, + "balance_loss_mlp": 1.09360409, + "epoch": 0.18337040822461373, + "flos": 23331323214720.0, + "grad_norm": 2.2450490390521343, + "language_loss": 0.84396011, + "learning_rate": 3.7584203784340865e-06, + "loss": 0.87423962, + "num_input_tokens_seen": 32303260, + "router_z_loss_clip": 3.73828125, + "router_z_loss_mlp": 3.43164062, + "step": 1525, + "time_per_iteration": 2.9642348289489746 + }, + { + "auxiliary_loss_clip": 0.01587749, + "auxiliary_loss_mlp": 0.01432773, + "balance_loss_clip": 1.21483088, + "balance_loss_mlp": 1.08868694, + "epoch": 0.1834906511152528, + "flos": 25011898814880.0, + "grad_norm": 2.115925555793821, + "language_loss": 0.86220932, + "learning_rate": 3.7580491164191938e-06, + "loss": 0.89241451, + "num_input_tokens_seen": 32321570, + "router_z_loss_clip": 3.73242188, + "router_z_loss_mlp": 3.4453125, + "step": 1526, + "time_per_iteration": 3.0101585388183594 + }, + { + "auxiliary_loss_clip": 0.01724359, + "auxiliary_loss_mlp": 0.01375603, + "balance_loss_clip": 1.35951865, + "balance_loss_mlp": 1.05669403, + "epoch": 0.1836108940058919, + "flos": 67257766929600.0, + "grad_norm": 0.7612123993616065, + "language_loss": 0.61218286, + "learning_rate": 3.757677587709648e-06, + "loss": 0.6431824, + "num_input_tokens_seen": 32384835, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 3.1875, + "step": 1527, + "time_per_iteration": 4.339364767074585 + }, + { + "auxiliary_loss_clip": 0.01591191, + "auxiliary_loss_mlp": 0.01469075, + "balance_loss_clip": 1.21943879, + "balance_loss_mlp": 1.12575269, + "epoch": 0.183731136896531, + "flos": 25741074977280.0, + "grad_norm": 2.149827421237752, + "language_loss": 0.75767159, + "learning_rate": 3.7573057923618095e-06, + "loss": 0.78827423, + "num_input_tokens_seen": 32404930, + "router_z_loss_clip": 3.71875, + "router_z_loss_mlp": 3.4375, + "step": 1528, + "time_per_iteration": 2.986525774002075 + }, + { + "auxiliary_loss_clip": 0.01586124, + "auxiliary_loss_mlp": 0.0148618, + "balance_loss_clip": 1.21491861, + "balance_loss_mlp": 1.15048671, + "epoch": 0.1838513797871701, + "flos": 20451523029120.0, + "grad_norm": 2.7408386440667996, + "language_loss": 0.74519897, + "learning_rate": 3.7569337304320793e-06, + "loss": 0.775922, + "num_input_tokens_seen": 32424515, + "router_z_loss_clip": 3.71679688, + "router_z_loss_mlp": 3.359375, + "step": 1529, + "time_per_iteration": 3.813488006591797 + }, + { + "auxiliary_loss_clip": 0.01745183, + "auxiliary_loss_mlp": 0.01510391, + "balance_loss_clip": 1.38089323, + "balance_loss_mlp": 1.18156433, + "epoch": 0.18397162267780917, + "flos": 68571497723040.0, + "grad_norm": 0.8727148657598671, + "language_loss": 0.64463621, + "learning_rate": 3.756561401976899e-06, + "loss": 0.67719197, + "num_input_tokens_seen": 32484220, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 3.2890625, + "step": 1530, + "time_per_iteration": 3.253875494003296 + }, + { + "auxiliary_loss_clip": 0.01600116, + "auxiliary_loss_mlp": 0.01451745, + "balance_loss_clip": 1.22868872, + "balance_loss_mlp": 1.11147368, + "epoch": 0.18409186556844825, + "flos": 31943794413600.0, + "grad_norm": 1.8486366777375298, + "language_loss": 0.82841593, + "learning_rate": 3.7561888070527514e-06, + "loss": 0.85893452, + "num_input_tokens_seen": 32506260, + "router_z_loss_clip": 3.71289062, + "router_z_loss_mlp": 3.40625, + "step": 1531, + "time_per_iteration": 3.0915653705596924 + }, + { + "auxiliary_loss_clip": 0.01585911, + "auxiliary_loss_mlp": 0.01436137, + "balance_loss_clip": 1.21469891, + "balance_loss_mlp": 1.09262323, + "epoch": 0.18421210845908736, + "flos": 20122606746720.0, + "grad_norm": 2.4347298286359984, + "language_loss": 0.80308998, + "learning_rate": 3.7558159457161577e-06, + "loss": 0.83331043, + "num_input_tokens_seen": 32524225, + "router_z_loss_clip": 3.7109375, + "router_z_loss_mlp": 3.43945312, + "step": 1532, + "time_per_iteration": 3.765170097351074 + }, + { + "auxiliary_loss_clip": 0.0160536, + "auxiliary_loss_mlp": 0.01406332, + "balance_loss_clip": 1.23451507, + "balance_loss_mlp": 1.0645349, + "epoch": 0.18433235134972645, + "flos": 23112627258240.0, + "grad_norm": 2.851395166592197, + "language_loss": 0.78155434, + "learning_rate": 3.755442818023681e-06, + "loss": 0.81167126, + "num_input_tokens_seen": 32543850, + "router_z_loss_clip": 3.70898438, + "router_z_loss_mlp": 3.421875, + "step": 1533, + "time_per_iteration": 3.0526621341705322 + }, + { + "auxiliary_loss_clip": 0.01603639, + "auxiliary_loss_mlp": 0.01382205, + "balance_loss_clip": 1.2320348, + "balance_loss_mlp": 1.04250646, + "epoch": 0.18445259424036553, + "flos": 18293047876800.0, + "grad_norm": 2.8179975986854604, + "language_loss": 0.75991797, + "learning_rate": 3.7550694240319246e-06, + "loss": 0.78977644, + "num_input_tokens_seen": 32561725, + "router_z_loss_clip": 3.72070312, + "router_z_loss_mlp": 3.40039062, + "step": 1534, + "time_per_iteration": 3.8741700649261475 + }, + { + "auxiliary_loss_clip": 0.0160105, + "auxiliary_loss_mlp": 0.01395047, + "balance_loss_clip": 1.23122907, + "balance_loss_mlp": 1.05649221, + "epoch": 0.18457283713100464, + "flos": 21326193070560.0, + "grad_norm": 2.2626956298357137, + "language_loss": 0.76428705, + "learning_rate": 3.7546957637975326e-06, + "loss": 0.79424804, + "num_input_tokens_seen": 32579135, + "router_z_loss_clip": 3.70703125, + "router_z_loss_mlp": 3.38867188, + "step": 1535, + "time_per_iteration": 3.0042221546173096 + }, + { + "auxiliary_loss_clip": 0.01598732, + "auxiliary_loss_mlp": 0.01390729, + "balance_loss_clip": 1.22912347, + "balance_loss_mlp": 1.04988551, + "epoch": 0.18469308002164372, + "flos": 20377752029280.0, + "grad_norm": 1.767972365385556, + "language_loss": 0.74198359, + "learning_rate": 3.7543218373771873e-06, + "loss": 0.77187818, + "num_input_tokens_seen": 32598460, + "router_z_loss_clip": 3.6953125, + "router_z_loss_mlp": 3.41210938, + "step": 1536, + "time_per_iteration": 3.0602872371673584 + }, + { + "auxiliary_loss_clip": 0.01610946, + "auxiliary_loss_mlp": 0.01413026, + "balance_loss_clip": 1.24085712, + "balance_loss_mlp": 1.07122958, + "epoch": 0.1848133229122828, + "flos": 26438504905440.0, + "grad_norm": 1.463720695513007, + "language_loss": 0.7824446, + "learning_rate": 3.753947644827615e-06, + "loss": 0.8126843, + "num_input_tokens_seen": 32621920, + "router_z_loss_clip": 3.703125, + "router_z_loss_mlp": 3.421875, + "step": 1537, + "time_per_iteration": 3.0545835494995117 + }, + { + "auxiliary_loss_clip": 0.01749135, + "auxiliary_loss_mlp": 0.01339371, + "balance_loss_clip": 1.38676739, + "balance_loss_mlp": 1.02122498, + "epoch": 0.1849335658029219, + "flos": 70554591241920.0, + "grad_norm": 0.9061622898609141, + "language_loss": 0.57141572, + "learning_rate": 3.753573186205579e-06, + "loss": 0.60230076, + "num_input_tokens_seen": 32690040, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 3.1796875, + "step": 1538, + "time_per_iteration": 3.548205614089966 + }, + { + "auxiliary_loss_clip": 0.01604506, + "auxiliary_loss_mlp": 0.01436051, + "balance_loss_clip": 1.23615456, + "balance_loss_mlp": 1.09425449, + "epoch": 0.185053808693561, + "flos": 17386062678720.0, + "grad_norm": 2.6482058102882076, + "language_loss": 0.77957582, + "learning_rate": 3.753198461567885e-06, + "loss": 0.80998147, + "num_input_tokens_seen": 32707285, + "router_z_loss_clip": 3.6875, + "router_z_loss_mlp": 3.421875, + "step": 1539, + "time_per_iteration": 2.98878812789917 + }, + { + "auxiliary_loss_clip": 0.01608718, + "auxiliary_loss_mlp": 0.01432175, + "balance_loss_clip": 1.23940945, + "balance_loss_mlp": 1.08408415, + "epoch": 0.18517405158420008, + "flos": 28988706101760.0, + "grad_norm": 1.8537496897354726, + "language_loss": 0.91811317, + "learning_rate": 3.7528234709713783e-06, + "loss": 0.94852209, + "num_input_tokens_seen": 32730030, + "router_z_loss_clip": 3.69335938, + "router_z_loss_mlp": 3.484375, + "step": 1540, + "time_per_iteration": 3.1607041358947754 + }, + { + "auxiliary_loss_clip": 0.01616105, + "auxiliary_loss_mlp": 0.01543822, + "balance_loss_clip": 1.24783158, + "balance_loss_mlp": 1.21194315, + "epoch": 0.18529429447483917, + "flos": 26797271014080.0, + "grad_norm": 1.994952887968203, + "language_loss": 0.84939778, + "learning_rate": 3.7524482144729447e-06, + "loss": 0.88099706, + "num_input_tokens_seen": 32749485, + "router_z_loss_clip": 3.6796875, + "router_z_loss_mlp": 3.3203125, + "step": 1541, + "time_per_iteration": 3.0831093788146973 + }, + { + "auxiliary_loss_clip": 0.01616794, + "auxiliary_loss_mlp": 0.01456523, + "balance_loss_clip": 1.24757361, + "balance_loss_mlp": 1.10957658, + "epoch": 0.18541453736547828, + "flos": 13582854401760.0, + "grad_norm": 18.94044583552159, + "language_loss": 0.83665442, + "learning_rate": 3.7520726921295106e-06, + "loss": 0.86738753, + "num_input_tokens_seen": 32766205, + "router_z_loss_clip": 3.6875, + "router_z_loss_mlp": 3.47265625, + "step": 1542, + "time_per_iteration": 2.935265064239502 + }, + { + "auxiliary_loss_clip": 0.01612164, + "auxiliary_loss_mlp": 0.01445046, + "balance_loss_clip": 1.2425909, + "balance_loss_mlp": 1.10420322, + "epoch": 0.18553478025611736, + "flos": 24027767010720.0, + "grad_norm": 2.1231000848854893, + "language_loss": 0.72833842, + "learning_rate": 3.751696903998042e-06, + "loss": 0.75891054, + "num_input_tokens_seen": 32784840, + "router_z_loss_clip": 3.69335938, + "router_z_loss_mlp": 3.41210938, + "step": 1543, + "time_per_iteration": 2.944706678390503 + }, + { + "auxiliary_loss_clip": 0.01608863, + "auxiliary_loss_mlp": 0.01437773, + "balance_loss_clip": 1.24024618, + "balance_loss_mlp": 1.09330535, + "epoch": 0.18565502314675644, + "flos": 25887554988480.0, + "grad_norm": 1.8874131843729614, + "language_loss": 0.70443726, + "learning_rate": 3.7513208501355456e-06, + "loss": 0.73490357, + "num_input_tokens_seen": 32805945, + "router_z_loss_clip": 3.68554688, + "router_z_loss_mlp": 3.44921875, + "step": 1544, + "time_per_iteration": 3.017982006072998 + }, + { + "auxiliary_loss_clip": 0.016154, + "auxiliary_loss_mlp": 0.01418238, + "balance_loss_clip": 1.24649072, + "balance_loss_mlp": 1.07358027, + "epoch": 0.18577526603739553, + "flos": 19612126540800.0, + "grad_norm": 1.896171207836713, + "language_loss": 0.838467, + "learning_rate": 3.750944530599069e-06, + "loss": 0.86880326, + "num_input_tokens_seen": 32825515, + "router_z_loss_clip": 3.68945312, + "router_z_loss_mlp": 3.45117188, + "step": 1545, + "time_per_iteration": 3.0508344173431396 + }, + { + "auxiliary_loss_clip": 0.01613704, + "auxiliary_loss_mlp": 0.01427459, + "balance_loss_clip": 1.2449894, + "balance_loss_mlp": 1.08528066, + "epoch": 0.18589550892803464, + "flos": 18476659920960.0, + "grad_norm": 2.549371778927089, + "language_loss": 0.81446278, + "learning_rate": 3.7505679454456992e-06, + "loss": 0.84487444, + "num_input_tokens_seen": 32842125, + "router_z_loss_clip": 3.69140625, + "router_z_loss_mlp": 3.42578125, + "step": 1546, + "time_per_iteration": 3.019584894180298 + }, + { + "auxiliary_loss_clip": 0.01609587, + "auxiliary_loss_mlp": 0.014248, + "balance_loss_clip": 1.24203765, + "balance_loss_mlp": 1.08204901, + "epoch": 0.18601575181867372, + "flos": 23552219004480.0, + "grad_norm": 2.5283397629804685, + "language_loss": 0.70274287, + "learning_rate": 3.750191094732564e-06, + "loss": 0.73308671, + "num_input_tokens_seen": 32862990, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 3.43164062, + "step": 1547, + "time_per_iteration": 2.9883198738098145 + }, + { + "auxiliary_loss_clip": 0.01608715, + "auxiliary_loss_mlp": 0.01412294, + "balance_loss_clip": 1.24075222, + "balance_loss_mlp": 1.06954336, + "epoch": 0.1861359947093128, + "flos": 26362647856800.0, + "grad_norm": 1.965907700552901, + "language_loss": 0.75638652, + "learning_rate": 3.7498139785168313e-06, + "loss": 0.78659654, + "num_input_tokens_seen": 32883595, + "router_z_loss_clip": 3.68164062, + "router_z_loss_mlp": 3.43164062, + "step": 1548, + "time_per_iteration": 2.999825954437256 + }, + { + "auxiliary_loss_clip": 0.0160403, + "auxiliary_loss_mlp": 0.01386799, + "balance_loss_clip": 1.23606408, + "balance_loss_mlp": 1.04900706, + "epoch": 0.1862562375999519, + "flos": 23333447191680.0, + "grad_norm": 1.8678846385556194, + "language_loss": 0.7729305, + "learning_rate": 3.749436596855709e-06, + "loss": 0.8028388, + "num_input_tokens_seen": 32902895, + "router_z_loss_clip": 3.68164062, + "router_z_loss_mlp": 3.38085938, + "step": 1549, + "time_per_iteration": 3.0718495845794678 + }, + { + "auxiliary_loss_clip": 0.01601987, + "auxiliary_loss_mlp": 0.01410306, + "balance_loss_clip": 1.23223329, + "balance_loss_mlp": 1.06984425, + "epoch": 0.186376480490591, + "flos": 16649756022240.0, + "grad_norm": 2.6787681349165813, + "language_loss": 0.90590745, + "learning_rate": 3.749058949806446e-06, + "loss": 0.93603039, + "num_input_tokens_seen": 32919620, + "router_z_loss_clip": 3.69726562, + "router_z_loss_mlp": 3.40820312, + "step": 1550, + "time_per_iteration": 3.104231834411621 + }, + { + "auxiliary_loss_clip": 0.01606559, + "auxiliary_loss_mlp": 0.01401981, + "balance_loss_clip": 1.23736334, + "balance_loss_mlp": 1.06132829, + "epoch": 0.18649672338123008, + "flos": 21470700817440.0, + "grad_norm": 5.298120356867903, + "language_loss": 0.84194553, + "learning_rate": 3.748681037426331e-06, + "loss": 0.87203097, + "num_input_tokens_seen": 32938830, + "router_z_loss_clip": 3.6953125, + "router_z_loss_mlp": 3.41015625, + "step": 1551, + "time_per_iteration": 3.0429728031158447 + }, + { + "auxiliary_loss_clip": 0.0161089, + "auxiliary_loss_mlp": 0.01394287, + "balance_loss_clip": 1.24275053, + "balance_loss_mlp": 1.05344391, + "epoch": 0.1866169662718692, + "flos": 12314372267520.0, + "grad_norm": 2.1525669566838053, + "language_loss": 0.92053968, + "learning_rate": 3.7483028597726936e-06, + "loss": 0.95059144, + "num_input_tokens_seen": 32955600, + "router_z_loss_clip": 3.68359375, + "router_z_loss_mlp": 3.41210938, + "step": 1552, + "time_per_iteration": 2.9832444190979004 + }, + { + "auxiliary_loss_clip": 0.01605791, + "auxiliary_loss_mlp": 0.013984, + "balance_loss_clip": 1.23626328, + "balance_loss_mlp": 1.06022692, + "epoch": 0.18673720916250827, + "flos": 23583244603680.0, + "grad_norm": 2.7100355663668187, + "language_loss": 0.62667096, + "learning_rate": 3.7479244169029017e-06, + "loss": 0.65671289, + "num_input_tokens_seen": 32975390, + "router_z_loss_clip": 3.6953125, + "router_z_loss_mlp": 3.38476562, + "step": 1553, + "time_per_iteration": 3.009676218032837 + }, + { + "auxiliary_loss_clip": 0.01596421, + "auxiliary_loss_mlp": 0.01379646, + "balance_loss_clip": 1.22755718, + "balance_loss_mlp": 1.04204488, + "epoch": 0.18685745205314735, + "flos": 19720374602400.0, + "grad_norm": 8.244188938436999, + "language_loss": 0.7346853, + "learning_rate": 3.7475457088743658e-06, + "loss": 0.7644459, + "num_input_tokens_seen": 32992640, + "router_z_loss_clip": 3.68945312, + "router_z_loss_mlp": 3.37890625, + "step": 1554, + "time_per_iteration": 3.802406072616577 + }, + { + "auxiliary_loss_clip": 0.01613933, + "auxiliary_loss_mlp": 0.01405261, + "balance_loss_clip": 1.24530959, + "balance_loss_mlp": 1.07033062, + "epoch": 0.18697769494378644, + "flos": 34206990308640.0, + "grad_norm": 2.125689676773975, + "language_loss": 0.75043517, + "learning_rate": 3.7471667357445348e-06, + "loss": 0.78062713, + "num_input_tokens_seen": 33012470, + "router_z_loss_clip": 3.68359375, + "router_z_loss_mlp": 3.3515625, + "step": 1555, + "time_per_iteration": 3.0840728282928467 + }, + { + "auxiliary_loss_clip": 0.01596873, + "auxiliary_loss_mlp": 0.01436205, + "balance_loss_clip": 1.22830629, + "balance_loss_mlp": 1.10604346, + "epoch": 0.18709793783442555, + "flos": 34244236126080.0, + "grad_norm": 2.0104716034238965, + "language_loss": 0.72290373, + "learning_rate": 3.7467874975709e-06, + "loss": 0.75323451, + "num_input_tokens_seen": 33033275, + "router_z_loss_clip": 3.6875, + "router_z_loss_mlp": 3.30273438, + "step": 1556, + "time_per_iteration": 3.9449851512908936 + }, + { + "auxiliary_loss_clip": 0.01613322, + "auxiliary_loss_mlp": 0.01508038, + "balance_loss_clip": 1.24397469, + "balance_loss_mlp": 1.18188107, + "epoch": 0.18721818072506463, + "flos": 40737753679680.0, + "grad_norm": 2.7119593087207727, + "language_loss": 0.78876865, + "learning_rate": 3.7464079944109904e-06, + "loss": 0.81998229, + "num_input_tokens_seen": 33055135, + "router_z_loss_clip": 3.69140625, + "router_z_loss_mlp": 3.26171875, + "step": 1557, + "time_per_iteration": 3.1376254558563232 + }, + { + "auxiliary_loss_clip": 0.01600678, + "auxiliary_loss_mlp": 0.01545082, + "balance_loss_clip": 1.23203385, + "balance_loss_mlp": 1.22216773, + "epoch": 0.18733842361570371, + "flos": 22159217628000.0, + "grad_norm": 2.0818485329839755, + "language_loss": 0.77166116, + "learning_rate": 3.746028226322376e-06, + "loss": 0.80311882, + "num_input_tokens_seen": 33071015, + "router_z_loss_clip": 3.6875, + "router_z_loss_mlp": 3.22851562, + "step": 1558, + "time_per_iteration": 2.984222888946533 + }, + { + "auxiliary_loss_clip": 0.015994, + "auxiliary_loss_mlp": 0.01531484, + "balance_loss_clip": 1.23006654, + "balance_loss_mlp": 1.20723462, + "epoch": 0.18745866650634282, + "flos": 18916858517760.0, + "grad_norm": 1.7222237646022047, + "language_loss": 0.75439852, + "learning_rate": 3.745648193362669e-06, + "loss": 0.78570735, + "num_input_tokens_seen": 33090370, + "router_z_loss_clip": 3.69140625, + "router_z_loss_mlp": 3.2421875, + "step": 1559, + "time_per_iteration": 3.0068013668060303 + }, + { + "auxiliary_loss_clip": 0.01606499, + "auxiliary_loss_mlp": 0.01494065, + "balance_loss_clip": 1.2380271, + "balance_loss_mlp": 1.16695499, + "epoch": 0.1875789093969819, + "flos": 19316473619040.0, + "grad_norm": 6.544404223323578, + "language_loss": 0.72405338, + "learning_rate": 3.745267895589518e-06, + "loss": 0.755059, + "num_input_tokens_seen": 33108910, + "router_z_loss_clip": 3.68554688, + "router_z_loss_mlp": 3.27148438, + "step": 1560, + "time_per_iteration": 3.7983322143554688 + }, + { + "auxiliary_loss_clip": 0.01591711, + "auxiliary_loss_mlp": 0.0143259, + "balance_loss_clip": 1.22298932, + "balance_loss_mlp": 1.10109305, + "epoch": 0.187699152287621, + "flos": 17020848782880.0, + "grad_norm": 32.96118543899973, + "language_loss": 0.82350695, + "learning_rate": 3.7448873330606154e-06, + "loss": 0.85374993, + "num_input_tokens_seen": 33126680, + "router_z_loss_clip": 3.6875, + "router_z_loss_mlp": 3.31640625, + "step": 1561, + "time_per_iteration": 2.959077835083008 + }, + { + "auxiliary_loss_clip": 0.0159666, + "auxiliary_loss_mlp": 0.01416441, + "balance_loss_clip": 1.22689581, + "balance_loss_mlp": 1.08112884, + "epoch": 0.18781939517826007, + "flos": 22348594752480.0, + "grad_norm": 3.0217571866289714, + "language_loss": 0.875808, + "learning_rate": 3.7445065058336914e-06, + "loss": 0.90593898, + "num_input_tokens_seen": 33145550, + "router_z_loss_clip": 3.69921875, + "router_z_loss_mlp": 3.35546875, + "step": 1562, + "time_per_iteration": 3.7176196575164795 + }, + { + "auxiliary_loss_clip": 0.01598639, + "auxiliary_loss_mlp": 0.0138678, + "balance_loss_clip": 1.22833407, + "balance_loss_mlp": 1.04975188, + "epoch": 0.18793963806889918, + "flos": 14613258925440.0, + "grad_norm": 2.0990188732568336, + "language_loss": 0.86259019, + "learning_rate": 3.7441254139665176e-06, + "loss": 0.89244437, + "num_input_tokens_seen": 33161735, + "router_z_loss_clip": 3.703125, + "router_z_loss_mlp": 3.37304688, + "step": 1563, + "time_per_iteration": 3.024958610534668 + }, + { + "auxiliary_loss_clip": 0.0160494, + "auxiliary_loss_mlp": 0.01379244, + "balance_loss_clip": 1.23426783, + "balance_loss_mlp": 1.04240644, + "epoch": 0.18805988095953827, + "flos": 17458999259040.0, + "grad_norm": 1.9720301200983699, + "language_loss": 0.8280068, + "learning_rate": 3.743744057516905e-06, + "loss": 0.85784864, + "num_input_tokens_seen": 33179795, + "router_z_loss_clip": 3.70898438, + "router_z_loss_mlp": 3.37109375, + "step": 1564, + "time_per_iteration": 2.952645778656006 + }, + { + "auxiliary_loss_clip": 0.01587793, + "auxiliary_loss_mlp": 0.01380587, + "balance_loss_clip": 1.21755552, + "balance_loss_mlp": 1.04241455, + "epoch": 0.18818012385017735, + "flos": 15045378824160.0, + "grad_norm": 4.553249754267892, + "language_loss": 0.87922227, + "learning_rate": 3.743362436542706e-06, + "loss": 0.9089061, + "num_input_tokens_seen": 33194485, + "router_z_loss_clip": 3.703125, + "router_z_loss_mlp": 3.38476562, + "step": 1565, + "time_per_iteration": 2.9111759662628174 + }, + { + "auxiliary_loss_clip": 0.01588529, + "auxiliary_loss_mlp": 0.01412747, + "balance_loss_clip": 1.21850991, + "balance_loss_mlp": 1.07133126, + "epoch": 0.18830036674081646, + "flos": 47555256854880.0, + "grad_norm": 1.9795654031384537, + "language_loss": 0.76746261, + "learning_rate": 3.7429805511018115e-06, + "loss": 0.79747534, + "num_input_tokens_seen": 33216145, + "router_z_loss_clip": 3.69921875, + "router_z_loss_mlp": 3.41796875, + "step": 1566, + "time_per_iteration": 3.1488254070281982 + }, + { + "auxiliary_loss_clip": 0.01591332, + "auxiliary_loss_mlp": 0.01506418, + "balance_loss_clip": 1.22102666, + "balance_loss_mlp": 1.17034316, + "epoch": 0.18842060963145554, + "flos": 30047253684480.0, + "grad_norm": 1.9746656668806128, + "language_loss": 0.77890193, + "learning_rate": 3.7425984012521524e-06, + "loss": 0.80987942, + "num_input_tokens_seen": 33236345, + "router_z_loss_clip": 3.69921875, + "router_z_loss_mlp": 3.36328125, + "step": 1567, + "time_per_iteration": 2.967702627182007 + }, + { + "auxiliary_loss_clip": 0.01774715, + "auxiliary_loss_mlp": 0.01440208, + "balance_loss_clip": 1.40890002, + "balance_loss_mlp": 1.13884735, + "epoch": 0.18854085252209463, + "flos": 70324934047200.0, + "grad_norm": 0.7921382677930126, + "language_loss": 0.60373724, + "learning_rate": 3.7422159870517025e-06, + "loss": 0.63588655, + "num_input_tokens_seen": 33301600, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 3.0078125, + "step": 1568, + "time_per_iteration": 3.5171573162078857 + }, + { + "auxiliary_loss_clip": 0.01585683, + "auxiliary_loss_mlp": 0.01444755, + "balance_loss_clip": 1.21488059, + "balance_loss_mlp": 1.10200405, + "epoch": 0.1886610954127337, + "flos": 21291526368000.0, + "grad_norm": 1.6560702888501886, + "language_loss": 0.79033262, + "learning_rate": 3.7418333085584717e-06, + "loss": 0.82063699, + "num_input_tokens_seen": 33322785, + "router_z_loss_clip": 3.70898438, + "router_z_loss_mlp": 3.43164062, + "step": 1569, + "time_per_iteration": 3.0658881664276123 + }, + { + "auxiliary_loss_clip": 0.01603493, + "auxiliary_loss_mlp": 0.01424181, + "balance_loss_clip": 1.23508477, + "balance_loss_mlp": 1.08295655, + "epoch": 0.18878133830337282, + "flos": 17268294648960.0, + "grad_norm": 2.1537834250477665, + "language_loss": 0.91158968, + "learning_rate": 3.7414503658305128e-06, + "loss": 0.94186652, + "num_input_tokens_seen": 33340020, + "router_z_loss_clip": 3.68554688, + "router_z_loss_mlp": 3.41601562, + "step": 1570, + "time_per_iteration": 2.938969135284424 + }, + { + "auxiliary_loss_clip": 0.01607009, + "auxiliary_loss_mlp": 0.01418961, + "balance_loss_clip": 1.23800063, + "balance_loss_mlp": 1.07830906, + "epoch": 0.1889015811940119, + "flos": 25777562231520.0, + "grad_norm": 2.4105098653619184, + "language_loss": 0.7821548, + "learning_rate": 3.7410671589259185e-06, + "loss": 0.81241453, + "num_input_tokens_seen": 33358620, + "router_z_loss_clip": 3.6875, + "router_z_loss_mlp": 3.41015625, + "step": 1571, + "time_per_iteration": 2.9875295162200928 + }, + { + "auxiliary_loss_clip": 0.01603253, + "auxiliary_loss_mlp": 0.01422914, + "balance_loss_clip": 1.23577428, + "balance_loss_mlp": 1.07901883, + "epoch": 0.18902182408465099, + "flos": 21034295036640.0, + "grad_norm": 3.5171840926232583, + "language_loss": 0.79648674, + "learning_rate": 3.7406836879028205e-06, + "loss": 0.82674837, + "num_input_tokens_seen": 33378845, + "router_z_loss_clip": 3.67382812, + "router_z_loss_mlp": 3.44335938, + "step": 1572, + "time_per_iteration": 2.9538369178771973 + }, + { + "auxiliary_loss_clip": 0.01600157, + "auxiliary_loss_mlp": 0.01420712, + "balance_loss_clip": 1.23191571, + "balance_loss_mlp": 1.07967854, + "epoch": 0.1891420669752901, + "flos": 22274254830240.0, + "grad_norm": 7.017275704272007, + "language_loss": 0.77026677, + "learning_rate": 3.7402999528193907e-06, + "loss": 0.80047554, + "num_input_tokens_seen": 33398345, + "router_z_loss_clip": 3.68359375, + "router_z_loss_mlp": 3.4140625, + "step": 1573, + "time_per_iteration": 2.9509050846099854 + }, + { + "auxiliary_loss_clip": 0.01607616, + "auxiliary_loss_mlp": 0.01431029, + "balance_loss_clip": 1.2390213, + "balance_loss_mlp": 1.0909493, + "epoch": 0.18926230986592918, + "flos": 22019981895360.0, + "grad_norm": 3.8544729068695354, + "language_loss": 0.85995483, + "learning_rate": 3.739915953733842e-06, + "loss": 0.89034128, + "num_input_tokens_seen": 33416390, + "router_z_loss_clip": 3.6875, + "router_z_loss_mlp": 3.40429688, + "step": 1574, + "time_per_iteration": 3.0137546062469482 + }, + { + "auxiliary_loss_clip": 0.01599479, + "auxiliary_loss_mlp": 0.01409047, + "balance_loss_clip": 1.23093724, + "balance_loss_mlp": 1.06801271, + "epoch": 0.18938255275656826, + "flos": 24465007211040.0, + "grad_norm": 2.176348992654644, + "language_loss": 0.81811792, + "learning_rate": 3.7395316907044264e-06, + "loss": 0.84820324, + "num_input_tokens_seen": 33437175, + "router_z_loss_clip": 3.68554688, + "router_z_loss_mlp": 3.4140625, + "step": 1575, + "time_per_iteration": 2.938988447189331 + }, + { + "auxiliary_loss_clip": 0.01603418, + "auxiliary_loss_mlp": 0.01419703, + "balance_loss_clip": 1.23520088, + "balance_loss_mlp": 1.07809722, + "epoch": 0.18950279564720737, + "flos": 24429430232640.0, + "grad_norm": 1.7588412755132286, + "language_loss": 0.79479253, + "learning_rate": 3.7391471637894364e-06, + "loss": 0.82502377, + "num_input_tokens_seen": 33459440, + "router_z_loss_clip": 3.68359375, + "router_z_loss_mlp": 3.41992188, + "step": 1576, + "time_per_iteration": 2.9895150661468506 + }, + { + "auxiliary_loss_clip": 0.01604179, + "auxiliary_loss_mlp": 0.01413091, + "balance_loss_clip": 1.23489881, + "balance_loss_mlp": 1.07243824, + "epoch": 0.18962303853784646, + "flos": 19758378983040.0, + "grad_norm": 2.9064939601121877, + "language_loss": 0.84814751, + "learning_rate": 3.738762373047205e-06, + "loss": 0.87832016, + "num_input_tokens_seen": 33479360, + "router_z_loss_clip": 3.69140625, + "router_z_loss_mlp": 3.41015625, + "step": 1577, + "time_per_iteration": 2.958533525466919 + }, + { + "auxiliary_loss_clip": 0.01609394, + "auxiliary_loss_mlp": 0.01408333, + "balance_loss_clip": 1.24045241, + "balance_loss_mlp": 1.06672657, + "epoch": 0.18974328142848554, + "flos": 21034143324000.0, + "grad_norm": 1.8043170424072095, + "language_loss": 0.83517367, + "learning_rate": 3.738377318536103e-06, + "loss": 0.86535084, + "num_input_tokens_seen": 33499245, + "router_z_loss_clip": 3.69335938, + "router_z_loss_mlp": 3.41992188, + "step": 1578, + "time_per_iteration": 2.9197115898132324 + }, + { + "auxiliary_loss_clip": 0.01603083, + "auxiliary_loss_mlp": 0.01405696, + "balance_loss_clip": 1.23425508, + "balance_loss_mlp": 1.06180131, + "epoch": 0.18986352431912462, + "flos": 12967615524960.0, + "grad_norm": 2.28782309981087, + "language_loss": 0.71831214, + "learning_rate": 3.7379920003145447e-06, + "loss": 0.74839997, + "num_input_tokens_seen": 33513520, + "router_z_loss_clip": 3.6875, + "router_z_loss_mlp": 3.44140625, + "step": 1579, + "time_per_iteration": 3.058805227279663 + }, + { + "auxiliary_loss_clip": 0.01614283, + "auxiliary_loss_mlp": 0.01399662, + "balance_loss_clip": 1.24545145, + "balance_loss_mlp": 1.05214357, + "epoch": 0.18998376720976373, + "flos": 23770118469600.0, + "grad_norm": 1.7048228540071402, + "language_loss": 0.83919072, + "learning_rate": 3.7376064184409817e-06, + "loss": 0.86933017, + "num_input_tokens_seen": 33533100, + "router_z_loss_clip": 3.6875, + "router_z_loss_mlp": 3.48046875, + "step": 1580, + "time_per_iteration": 2.956239700317383 + }, + { + "auxiliary_loss_clip": 0.01606067, + "auxiliary_loss_mlp": 0.01390059, + "balance_loss_clip": 1.23810756, + "balance_loss_mlp": 1.04826248, + "epoch": 0.19010401010040281, + "flos": 22968233295840.0, + "grad_norm": 1.533758882596753, + "language_loss": 0.86961108, + "learning_rate": 3.7372205729739063e-06, + "loss": 0.89957231, + "num_input_tokens_seen": 33554915, + "router_z_loss_clip": 3.68164062, + "router_z_loss_mlp": 3.421875, + "step": 1581, + "time_per_iteration": 2.9511234760284424 + }, + { + "auxiliary_loss_clip": 0.01608491, + "auxiliary_loss_mlp": 0.01383622, + "balance_loss_clip": 1.23940802, + "balance_loss_mlp": 1.04087138, + "epoch": 0.1902242529910419, + "flos": 19137678451200.0, + "grad_norm": 2.7418268390591796, + "language_loss": 0.71830517, + "learning_rate": 3.7368344639718514e-06, + "loss": 0.74822628, + "num_input_tokens_seen": 33572850, + "router_z_loss_clip": 3.6953125, + "router_z_loss_mlp": 3.43164062, + "step": 1582, + "time_per_iteration": 3.759244918823242 + }, + { + "auxiliary_loss_clip": 0.01593588, + "auxiliary_loss_mlp": 0.01400313, + "balance_loss_clip": 1.22396827, + "balance_loss_mlp": 1.05679965, + "epoch": 0.190344495881681, + "flos": 25486308976320.0, + "grad_norm": 1.6341274858286658, + "language_loss": 0.80489564, + "learning_rate": 3.7364480914933895e-06, + "loss": 0.83483469, + "num_input_tokens_seen": 33593090, + "router_z_loss_clip": 3.69726562, + "router_z_loss_mlp": 3.43945312, + "step": 1583, + "time_per_iteration": 3.8413798809051514 + }, + { + "auxiliary_loss_clip": 0.01605661, + "auxiliary_loss_mlp": 0.01409739, + "balance_loss_clip": 1.23532081, + "balance_loss_mlp": 1.06965864, + "epoch": 0.1904647387723201, + "flos": 26795033252640.0, + "grad_norm": 9.08276769484289, + "language_loss": 0.80722493, + "learning_rate": 3.7360614555971325e-06, + "loss": 0.83737892, + "num_input_tokens_seen": 33612745, + "router_z_loss_clip": 3.70117188, + "router_z_loss_mlp": 3.40429688, + "step": 1584, + "time_per_iteration": 3.0148043632507324 + }, + { + "auxiliary_loss_clip": 0.01593905, + "auxiliary_loss_mlp": 0.01413357, + "balance_loss_clip": 1.22347617, + "balance_loss_mlp": 1.08186007, + "epoch": 0.19058498166295917, + "flos": 23990028127200.0, + "grad_norm": 1.953074788798975, + "language_loss": 0.8499282, + "learning_rate": 3.735674556341733e-06, + "loss": 0.88000083, + "num_input_tokens_seen": 33632360, + "router_z_loss_clip": 3.703125, + "router_z_loss_mlp": 3.31640625, + "step": 1585, + "time_per_iteration": 2.946767568588257 + }, + { + "auxiliary_loss_clip": 0.0159827, + "auxiliary_loss_mlp": 0.01457819, + "balance_loss_clip": 1.22816288, + "balance_loss_mlp": 1.12193489, + "epoch": 0.19070522455359826, + "flos": 28295827552800.0, + "grad_norm": 2.0549879739895065, + "language_loss": 0.83113235, + "learning_rate": 3.7352873937858835e-06, + "loss": 0.86169326, + "num_input_tokens_seen": 33653895, + "router_z_loss_clip": 3.70117188, + "router_z_loss_mlp": 3.36132812, + "step": 1586, + "time_per_iteration": 2.95202898979187 + }, + { + "auxiliary_loss_clip": 0.01590119, + "auxiliary_loss_mlp": 0.01486265, + "balance_loss_clip": 1.2191323, + "balance_loss_mlp": 1.14237022, + "epoch": 0.19082546744423737, + "flos": 25662259532160.0, + "grad_norm": 2.0870175440046643, + "language_loss": 0.71979654, + "learning_rate": 3.734899967988316e-06, + "loss": 0.75056034, + "num_input_tokens_seen": 33672075, + "router_z_loss_clip": 3.70898438, + "router_z_loss_mlp": 3.44335938, + "step": 1587, + "time_per_iteration": 3.778632640838623 + }, + { + "auxiliary_loss_clip": 0.01597985, + "auxiliary_loss_mlp": 0.0151986, + "balance_loss_clip": 1.22878599, + "balance_loss_mlp": 1.19007945, + "epoch": 0.19094571033487645, + "flos": 19721550375360.0, + "grad_norm": 1.9534840475312305, + "language_loss": 0.84130561, + "learning_rate": 3.7345122790078026e-06, + "loss": 0.87248403, + "num_input_tokens_seen": 33689640, + "router_z_loss_clip": 3.69335938, + "router_z_loss_mlp": 3.29882812, + "step": 1588, + "time_per_iteration": 2.914274215698242 + }, + { + "auxiliary_loss_clip": 0.0159069, + "auxiliary_loss_mlp": 0.01494001, + "balance_loss_clip": 1.22027588, + "balance_loss_mlp": 1.15601897, + "epoch": 0.19106595322551553, + "flos": 21618470386080.0, + "grad_norm": 31.772888591938166, + "language_loss": 0.9336803, + "learning_rate": 3.7341243269031556e-06, + "loss": 0.96452725, + "num_input_tokens_seen": 33708630, + "router_z_loss_clip": 3.70507812, + "router_z_loss_mlp": 3.3828125, + "step": 1589, + "time_per_iteration": 3.7970492839813232 + }, + { + "auxiliary_loss_clip": 0.01602016, + "auxiliary_loss_mlp": 0.01509366, + "balance_loss_clip": 1.23054886, + "balance_loss_mlp": 1.17233753, + "epoch": 0.19118619611615464, + "flos": 29899408259520.0, + "grad_norm": 1.6822854781952787, + "language_loss": 0.77325869, + "learning_rate": 3.7337361117332275e-06, + "loss": 0.80437249, + "num_input_tokens_seen": 33730370, + "router_z_loss_clip": 3.71875, + "router_z_loss_mlp": 3.37304688, + "step": 1590, + "time_per_iteration": 3.090538740158081 + }, + { + "auxiliary_loss_clip": 0.01597758, + "auxiliary_loss_mlp": 0.01484282, + "balance_loss_clip": 1.22630501, + "balance_loss_mlp": 1.14191246, + "epoch": 0.19130643900679373, + "flos": 17275273430400.0, + "grad_norm": 2.3130641081924783, + "language_loss": 0.76965117, + "learning_rate": 3.7333476335569087e-06, + "loss": 0.80047154, + "num_input_tokens_seen": 33748370, + "router_z_loss_clip": 3.71289062, + "router_z_loss_mlp": 3.42578125, + "step": 1591, + "time_per_iteration": 3.0013182163238525 + }, + { + "auxiliary_loss_clip": 0.015901, + "auxiliary_loss_mlp": 0.0144091, + "balance_loss_clip": 1.21900773, + "balance_loss_mlp": 1.0966332, + "epoch": 0.1914266818974328, + "flos": 24828628124160.0, + "grad_norm": 2.333099844650637, + "language_loss": 0.67290932, + "learning_rate": 3.7329588924331325e-06, + "loss": 0.70321941, + "num_input_tokens_seen": 33769575, + "router_z_loss_clip": 3.7109375, + "router_z_loss_mlp": 3.4453125, + "step": 1592, + "time_per_iteration": 2.959489107131958 + }, + { + "auxiliary_loss_clip": 0.01594733, + "auxiliary_loss_mlp": 0.01409027, + "balance_loss_clip": 1.22281039, + "balance_loss_mlp": 1.07047248, + "epoch": 0.1915469247880719, + "flos": 18954445688640.0, + "grad_norm": 1.8757003801656504, + "language_loss": 0.82668221, + "learning_rate": 3.732569888420871e-06, + "loss": 0.85671985, + "num_input_tokens_seen": 33789110, + "router_z_loss_clip": 3.72460938, + "router_z_loss_mlp": 3.38867188, + "step": 1593, + "time_per_iteration": 2.9710938930511475 + }, + { + "auxiliary_loss_clip": 0.01588034, + "auxiliary_loss_mlp": 0.01396278, + "balance_loss_clip": 1.21617508, + "balance_loss_mlp": 1.05924916, + "epoch": 0.191667167678711, + "flos": 21035091528000.0, + "grad_norm": 2.8621569929646737, + "language_loss": 0.82594025, + "learning_rate": 3.732180621579134e-06, + "loss": 0.85578334, + "num_input_tokens_seen": 33808325, + "router_z_loss_clip": 3.71679688, + "router_z_loss_mlp": 3.37304688, + "step": 1594, + "time_per_iteration": 2.910599708557129 + }, + { + "auxiliary_loss_clip": 0.01604776, + "auxiliary_loss_mlp": 0.0142552, + "balance_loss_clip": 1.23403311, + "balance_loss_mlp": 1.09116137, + "epoch": 0.1917874105693501, + "flos": 34240367453760.0, + "grad_norm": 1.8870852053768192, + "language_loss": 0.81225681, + "learning_rate": 3.7317910919669745e-06, + "loss": 0.84255981, + "num_input_tokens_seen": 33829520, + "router_z_loss_clip": 3.70898438, + "router_z_loss_mlp": 3.34570312, + "step": 1595, + "time_per_iteration": 3.039351463317871 + }, + { + "auxiliary_loss_clip": 0.0159643, + "auxiliary_loss_mlp": 0.01401256, + "balance_loss_clip": 1.2255919, + "balance_loss_mlp": 1.06632543, + "epoch": 0.19190765345998917, + "flos": 23552901711360.0, + "grad_norm": 2.435635794096029, + "language_loss": 0.76569545, + "learning_rate": 3.7314012996434826e-06, + "loss": 0.79567236, + "num_input_tokens_seen": 33848250, + "router_z_loss_clip": 3.70898438, + "router_z_loss_mlp": 3.3515625, + "step": 1596, + "time_per_iteration": 2.932352304458618 + }, + { + "auxiliary_loss_clip": 0.01590706, + "auxiliary_loss_mlp": 0.01410769, + "balance_loss_clip": 1.21749055, + "balance_loss_mlp": 1.07126093, + "epoch": 0.19202789635062828, + "flos": 19863175582080.0, + "grad_norm": 1.9803145945584415, + "language_loss": 0.81203604, + "learning_rate": 3.7310112446677907e-06, + "loss": 0.84205079, + "num_input_tokens_seen": 33866160, + "router_z_loss_clip": 3.73242188, + "router_z_loss_mlp": 3.3984375, + "step": 1597, + "time_per_iteration": 2.918344736099243 + }, + { + "auxiliary_loss_clip": 0.01595411, + "auxiliary_loss_mlp": 0.01404521, + "balance_loss_clip": 1.22305381, + "balance_loss_mlp": 1.06367803, + "epoch": 0.19214813924126736, + "flos": 20925060842880.0, + "grad_norm": 2.238883144055062, + "language_loss": 0.68868113, + "learning_rate": 3.7306209270990695e-06, + "loss": 0.71868038, + "num_input_tokens_seen": 33884165, + "router_z_loss_clip": 3.72460938, + "router_z_loss_mlp": 3.41210938, + "step": 1598, + "time_per_iteration": 2.95771861076355 + }, + { + "auxiliary_loss_clip": 0.0158975, + "auxiliary_loss_mlp": 0.01395355, + "balance_loss_clip": 1.21815562, + "balance_loss_mlp": 1.05985236, + "epoch": 0.19226838213190645, + "flos": 26362154790720.0, + "grad_norm": 2.1576981169139526, + "language_loss": 0.86655825, + "learning_rate": 3.7302303469965292e-06, + "loss": 0.89640927, + "num_input_tokens_seen": 33903705, + "router_z_loss_clip": 3.71679688, + "router_z_loss_mlp": 3.35742188, + "step": 1599, + "time_per_iteration": 2.956829071044922 + }, + { + "auxiliary_loss_clip": 0.01585316, + "auxiliary_loss_mlp": 0.01408962, + "balance_loss_clip": 1.21277153, + "balance_loss_mlp": 1.06830907, + "epoch": 0.19238862502254553, + "flos": 20852655256800.0, + "grad_norm": 1.821471619728899, + "language_loss": 0.70799553, + "learning_rate": 3.7298395044194206e-06, + "loss": 0.73793828, + "num_input_tokens_seen": 33922515, + "router_z_loss_clip": 3.7265625, + "router_z_loss_mlp": 3.41015625, + "step": 1600, + "time_per_iteration": 2.92624831199646 + }, + { + "auxiliary_loss_clip": 0.01592061, + "auxiliary_loss_mlp": 0.01407087, + "balance_loss_clip": 1.22000098, + "balance_loss_mlp": 1.0671978, + "epoch": 0.19250886791318464, + "flos": 21728235574080.0, + "grad_norm": 2.184548129570734, + "language_loss": 0.94455647, + "learning_rate": 3.7294483994270356e-06, + "loss": 0.97454792, + "num_input_tokens_seen": 33940840, + "router_z_loss_clip": 3.72070312, + "router_z_loss_mlp": 3.40234375, + "step": 1601, + "time_per_iteration": 2.9554696083068848 + }, + { + "auxiliary_loss_clip": 0.01583909, + "auxiliary_loss_mlp": 0.01398894, + "balance_loss_clip": 1.21065545, + "balance_loss_mlp": 1.06281948, + "epoch": 0.19262911080382372, + "flos": 23370086158560.0, + "grad_norm": 2.0229597463272935, + "language_loss": 0.78356361, + "learning_rate": 3.7290570320787033e-06, + "loss": 0.81339163, + "num_input_tokens_seen": 33960420, + "router_z_loss_clip": 3.73242188, + "router_z_loss_mlp": 3.36328125, + "step": 1602, + "time_per_iteration": 3.1193666458129883 + }, + { + "auxiliary_loss_clip": 0.01578097, + "auxiliary_loss_mlp": 0.01405475, + "balance_loss_clip": 1.20631599, + "balance_loss_mlp": 1.0684464, + "epoch": 0.1927493536944628, + "flos": 21945604044960.0, + "grad_norm": 2.9896023189236156, + "language_loss": 0.71904081, + "learning_rate": 3.728665402433793e-06, + "loss": 0.74887657, + "num_input_tokens_seen": 33978990, + "router_z_loss_clip": 3.71875, + "router_z_loss_mlp": 3.37304688, + "step": 1603, + "time_per_iteration": 3.007516622543335 + }, + { + "auxiliary_loss_clip": 0.01588485, + "auxiliary_loss_mlp": 0.01402057, + "balance_loss_clip": 1.21596169, + "balance_loss_mlp": 1.06884289, + "epoch": 0.19286959658510192, + "flos": 16547500609920.0, + "grad_norm": 2.4344206772015555, + "language_loss": 0.86305392, + "learning_rate": 3.7282735105517164e-06, + "loss": 0.89295936, + "num_input_tokens_seen": 33997115, + "router_z_loss_clip": 3.72460938, + "router_z_loss_mlp": 3.33398438, + "step": 1604, + "time_per_iteration": 2.972094774246216 + }, + { + "auxiliary_loss_clip": 0.01582838, + "auxiliary_loss_mlp": 0.01392455, + "balance_loss_clip": 1.21102488, + "balance_loss_mlp": 1.05695224, + "epoch": 0.192989839475741, + "flos": 21619153092960.0, + "grad_norm": 2.3544553878254897, + "language_loss": 0.67626256, + "learning_rate": 3.727881356491922e-06, + "loss": 0.70601547, + "num_input_tokens_seen": 34015525, + "router_z_loss_clip": 3.71679688, + "router_z_loss_mlp": 3.35742188, + "step": 1605, + "time_per_iteration": 2.935819625854492 + }, + { + "auxiliary_loss_clip": 0.01583856, + "auxiliary_loss_mlp": 0.01417069, + "balance_loss_clip": 1.21249533, + "balance_loss_mlp": 1.07508135, + "epoch": 0.19311008236638008, + "flos": 19283551611840.0, + "grad_norm": 2.3930201381818588, + "language_loss": 0.75763643, + "learning_rate": 3.7274889403139002e-06, + "loss": 0.78764564, + "num_input_tokens_seen": 34033150, + "router_z_loss_clip": 3.71484375, + "router_z_loss_mlp": 3.42382812, + "step": 1606, + "time_per_iteration": 2.920012950897217 + }, + { + "auxiliary_loss_clip": 0.01581721, + "auxiliary_loss_mlp": 0.01501103, + "balance_loss_clip": 1.20926166, + "balance_loss_mlp": 1.16312087, + "epoch": 0.1932303252570192, + "flos": 28654593661440.0, + "grad_norm": 3.1583776982886915, + "language_loss": 0.78834093, + "learning_rate": 3.727096262077179e-06, + "loss": 0.81916916, + "num_input_tokens_seen": 34052145, + "router_z_loss_clip": 3.7265625, + "router_z_loss_mlp": 3.3828125, + "step": 1607, + "time_per_iteration": 2.9639453887939453 + }, + { + "auxiliary_loss_clip": 0.01585262, + "auxiliary_loss_mlp": 0.01495028, + "balance_loss_clip": 1.21329689, + "balance_loss_mlp": 1.1553297, + "epoch": 0.19335056814765827, + "flos": 18370953046080.0, + "grad_norm": 1.8696662869196934, + "language_loss": 0.85493743, + "learning_rate": 3.7267033218413285e-06, + "loss": 0.88574028, + "num_input_tokens_seen": 34069940, + "router_z_loss_clip": 3.72070312, + "router_z_loss_mlp": 3.3984375, + "step": 1608, + "time_per_iteration": 2.984520435333252 + }, + { + "auxiliary_loss_clip": 0.01580412, + "auxiliary_loss_mlp": 0.0141482, + "balance_loss_clip": 1.20716512, + "balance_loss_mlp": 1.07683754, + "epoch": 0.19347081103829736, + "flos": 13262547811680.0, + "grad_norm": 2.4424287801039473, + "language_loss": 0.81090522, + "learning_rate": 3.726310119665957e-06, + "loss": 0.84085757, + "num_input_tokens_seen": 34086275, + "router_z_loss_clip": 3.72851562, + "router_z_loss_mlp": 3.3828125, + "step": 1609, + "time_per_iteration": 2.9628171920776367 + }, + { + "auxiliary_loss_clip": 0.0158021, + "auxiliary_loss_mlp": 0.01386829, + "balance_loss_clip": 1.2088002, + "balance_loss_mlp": 1.04999077, + "epoch": 0.19359105392893644, + "flos": 20302046693280.0, + "grad_norm": 1.9754977478279656, + "language_loss": 0.85583377, + "learning_rate": 3.725916655610713e-06, + "loss": 0.88550419, + "num_input_tokens_seen": 34105605, + "router_z_loss_clip": 3.71289062, + "router_z_loss_mlp": 3.37109375, + "step": 1610, + "time_per_iteration": 3.807673454284668 + }, + { + "auxiliary_loss_clip": 0.01582615, + "auxiliary_loss_mlp": 0.01394274, + "balance_loss_clip": 1.2103461, + "balance_loss_mlp": 1.05476546, + "epoch": 0.19371129681957555, + "flos": 20486644869600.0, + "grad_norm": 3.388859704538995, + "language_loss": 0.75607133, + "learning_rate": 3.725522929735284e-06, + "loss": 0.78584027, + "num_input_tokens_seen": 34122540, + "router_z_loss_clip": 3.72070312, + "router_z_loss_mlp": 3.3984375, + "step": 1611, + "time_per_iteration": 3.8380954265594482 + }, + { + "auxiliary_loss_clip": 0.01574066, + "auxiliary_loss_mlp": 0.01392022, + "balance_loss_clip": 1.20303059, + "balance_loss_mlp": 1.05289507, + "epoch": 0.19383153971021463, + "flos": 30448120415040.0, + "grad_norm": 2.6125451884361146, + "language_loss": 0.74565572, + "learning_rate": 3.725128942099399e-06, + "loss": 0.7753166, + "num_input_tokens_seen": 34142940, + "router_z_loss_clip": 3.70898438, + "router_z_loss_mlp": 3.39453125, + "step": 1612, + "time_per_iteration": 2.9772727489471436 + }, + { + "auxiliary_loss_clip": 0.01570102, + "auxiliary_loss_mlp": 0.01394728, + "balance_loss_clip": 1.19780552, + "balance_loss_mlp": 1.05655551, + "epoch": 0.19395178260085372, + "flos": 24572155356000.0, + "grad_norm": 4.216297971612611, + "language_loss": 0.80070204, + "learning_rate": 3.7247346927628245e-06, + "loss": 0.83035034, + "num_input_tokens_seen": 34162875, + "router_z_loss_clip": 3.72265625, + "router_z_loss_mlp": 3.38476562, + "step": 1613, + "time_per_iteration": 3.0377066135406494 + }, + { + "auxiliary_loss_clip": 0.01576234, + "auxiliary_loss_mlp": 0.0140642, + "balance_loss_clip": 1.20490503, + "balance_loss_mlp": 1.06347835, + "epoch": 0.19407202549149283, + "flos": 28953204979680.0, + "grad_norm": 1.7716861358521314, + "language_loss": 0.78973752, + "learning_rate": 3.7243401817853694e-06, + "loss": 0.81956404, + "num_input_tokens_seen": 34183565, + "router_z_loss_clip": 3.71679688, + "router_z_loss_mlp": 3.43359375, + "step": 1614, + "time_per_iteration": 3.924429416656494 + }, + { + "auxiliary_loss_clip": 0.01578235, + "auxiliary_loss_mlp": 0.01417144, + "balance_loss_clip": 1.20476139, + "balance_loss_mlp": 1.07363009, + "epoch": 0.1941922683821319, + "flos": 18006459785280.0, + "grad_norm": 1.8549662915839948, + "language_loss": 0.72238743, + "learning_rate": 3.723945409226879e-06, + "loss": 0.75234115, + "num_input_tokens_seen": 34202055, + "router_z_loss_clip": 3.73242188, + "router_z_loss_mlp": 3.43945312, + "step": 1615, + "time_per_iteration": 3.0468766689300537 + }, + { + "auxiliary_loss_clip": 0.01577562, + "auxiliary_loss_mlp": 0.01533512, + "balance_loss_clip": 1.20537806, + "balance_loss_mlp": 1.20106149, + "epoch": 0.194312511272771, + "flos": 9721729095840.0, + "grad_norm": 3.260412673266755, + "language_loss": 0.79790568, + "learning_rate": 3.723550375147241e-06, + "loss": 0.82901645, + "num_input_tokens_seen": 34216830, + "router_z_loss_clip": 3.72070312, + "router_z_loss_mlp": 3.32617188, + "step": 1616, + "time_per_iteration": 2.9552130699157715 + }, + { + "auxiliary_loss_clip": 0.01581076, + "auxiliary_loss_mlp": 0.01480883, + "balance_loss_clip": 1.20921004, + "balance_loss_mlp": 1.14156592, + "epoch": 0.19443275416341008, + "flos": 27018659869920.0, + "grad_norm": 1.8879482987630247, + "language_loss": 0.80288196, + "learning_rate": 3.7231550796063816e-06, + "loss": 0.83350152, + "num_input_tokens_seen": 34236840, + "router_z_loss_clip": 3.72070312, + "router_z_loss_mlp": 3.39648438, + "step": 1617, + "time_per_iteration": 3.0210115909576416 + }, + { + "auxiliary_loss_clip": 0.01576046, + "auxiliary_loss_mlp": 0.01408619, + "balance_loss_clip": 1.20320868, + "balance_loss_mlp": 1.06644022, + "epoch": 0.1945529970540492, + "flos": 15848174273760.0, + "grad_norm": 1.9563468427307722, + "language_loss": 0.64846641, + "learning_rate": 3.722759522664266e-06, + "loss": 0.67831302, + "num_input_tokens_seen": 34254140, + "router_z_loss_clip": 3.7265625, + "router_z_loss_mlp": 3.42578125, + "step": 1618, + "time_per_iteration": 3.7829842567443848 + }, + { + "auxiliary_loss_clip": 0.01580898, + "auxiliary_loss_mlp": 0.01388612, + "balance_loss_clip": 1.20746386, + "balance_loss_mlp": 1.0555886, + "epoch": 0.19467323994468827, + "flos": 19316852900640.0, + "grad_norm": 5.729727651026523, + "language_loss": 0.82147133, + "learning_rate": 3.7223637043809016e-06, + "loss": 0.85116649, + "num_input_tokens_seen": 34273120, + "router_z_loss_clip": 3.734375, + "router_z_loss_mlp": 3.33203125, + "step": 1619, + "time_per_iteration": 3.0360653400421143 + }, + { + "auxiliary_loss_clip": 0.01580395, + "auxiliary_loss_mlp": 0.01408552, + "balance_loss_clip": 1.20555902, + "balance_loss_mlp": 1.07228637, + "epoch": 0.19479348283532735, + "flos": 24135484078080.0, + "grad_norm": 2.6434974382657144, + "language_loss": 0.86677724, + "learning_rate": 3.7219676248163322e-06, + "loss": 0.89666677, + "num_input_tokens_seen": 34290285, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 3.36523438, + "step": 1620, + "time_per_iteration": 2.943288564682007 + }, + { + "auxiliary_loss_clip": 0.01580881, + "auxiliary_loss_mlp": 0.01418227, + "balance_loss_clip": 1.20646453, + "balance_loss_mlp": 1.07967234, + "epoch": 0.19491372572596646, + "flos": 25778017369440.0, + "grad_norm": 2.710562086676186, + "language_loss": 0.93711197, + "learning_rate": 3.721571284030643e-06, + "loss": 0.967103, + "num_input_tokens_seen": 34310095, + "router_z_loss_clip": 3.74023438, + "router_z_loss_mlp": 3.38867188, + "step": 1621, + "time_per_iteration": 2.963467597961426 + }, + { + "auxiliary_loss_clip": 0.015791, + "auxiliary_loss_mlp": 0.01420551, + "balance_loss_clip": 1.20350409, + "balance_loss_mlp": 1.07989883, + "epoch": 0.19503396861660555, + "flos": 19647210453120.0, + "grad_norm": 2.21940552571802, + "language_loss": 0.79196775, + "learning_rate": 3.7211746820839587e-06, + "loss": 0.8219642, + "num_input_tokens_seen": 34327190, + "router_z_loss_clip": 3.75195312, + "router_z_loss_mlp": 3.41015625, + "step": 1622, + "time_per_iteration": 2.9247238636016846 + }, + { + "auxiliary_loss_clip": 0.01573691, + "auxiliary_loss_mlp": 0.01446247, + "balance_loss_clip": 1.198632, + "balance_loss_mlp": 1.10826445, + "epoch": 0.19515421150724463, + "flos": 21035167384320.0, + "grad_norm": 2.5905494404183886, + "language_loss": 0.81286919, + "learning_rate": 3.7207778190364437e-06, + "loss": 0.84306854, + "num_input_tokens_seen": 34345615, + "router_z_loss_clip": 3.75195312, + "router_z_loss_mlp": 3.3828125, + "step": 1623, + "time_per_iteration": 2.910376787185669 + }, + { + "auxiliary_loss_clip": 0.01587858, + "auxiliary_loss_mlp": 0.01494421, + "balance_loss_clip": 1.21094084, + "balance_loss_mlp": 1.14919066, + "epoch": 0.1952744543978837, + "flos": 32963920405920.0, + "grad_norm": 2.122033488601802, + "language_loss": 0.73969108, + "learning_rate": 3.720380694948302e-06, + "loss": 0.77051383, + "num_input_tokens_seen": 34368500, + "router_z_loss_clip": 3.76757812, + "router_z_loss_mlp": 3.45507812, + "step": 1624, + "time_per_iteration": 3.0734121799468994 + }, + { + "auxiliary_loss_clip": 0.01750441, + "auxiliary_loss_mlp": 0.01528175, + "balance_loss_clip": 1.37353492, + "balance_loss_mlp": 1.19553375, + "epoch": 0.19539469728852282, + "flos": 64050605516160.0, + "grad_norm": 1.0672067699087437, + "language_loss": 0.71160209, + "learning_rate": 3.719983309879777e-06, + "loss": 0.74438822, + "num_input_tokens_seen": 34428280, + "router_z_loss_clip": 3.765625, + "router_z_loss_mlp": 3.328125, + "step": 1625, + "time_per_iteration": 3.438957929611206 + }, + { + "auxiliary_loss_clip": 0.01589181, + "auxiliary_loss_mlp": 0.01422764, + "balance_loss_clip": 1.21508622, + "balance_loss_mlp": 1.09107614, + "epoch": 0.1955149401791619, + "flos": 13372616424960.0, + "grad_norm": 2.0609614136476515, + "language_loss": 0.77727592, + "learning_rate": 3.719585663891151e-06, + "loss": 0.80739534, + "num_input_tokens_seen": 34445815, + "router_z_loss_clip": 3.74023438, + "router_z_loss_mlp": 3.31835938, + "step": 1626, + "time_per_iteration": 2.9337120056152344 + }, + { + "auxiliary_loss_clip": 0.01590587, + "auxiliary_loss_mlp": 0.01416664, + "balance_loss_clip": 1.216079, + "balance_loss_mlp": 1.07124352, + "epoch": 0.195635183069801, + "flos": 18730591502400.0, + "grad_norm": 2.8001618232162713, + "language_loss": 0.79523897, + "learning_rate": 3.719187757042747e-06, + "loss": 0.82531154, + "num_input_tokens_seen": 34463635, + "router_z_loss_clip": 3.74414062, + "router_z_loss_mlp": 3.45898438, + "step": 1627, + "time_per_iteration": 3.153069257736206 + }, + { + "auxiliary_loss_clip": 0.01748396, + "auxiliary_loss_mlp": 0.01390427, + "balance_loss_clip": 1.36948395, + "balance_loss_mlp": 1.0821991, + "epoch": 0.1957554259604401, + "flos": 69321420588960.0, + "grad_norm": 0.7624528948549734, + "language_loss": 0.54919666, + "learning_rate": 3.7187895893949275e-06, + "loss": 0.58058488, + "num_input_tokens_seen": 34530105, + "router_z_loss_clip": 3.78125, + "router_z_loss_mlp": 3.078125, + "step": 1628, + "time_per_iteration": 3.4802470207214355 + }, + { + "auxiliary_loss_clip": 0.01565907, + "auxiliary_loss_mlp": 0.01390818, + "balance_loss_clip": 1.18994057, + "balance_loss_mlp": 1.0513103, + "epoch": 0.19587566885107918, + "flos": 21071389141440.0, + "grad_norm": 7.515142853258965, + "language_loss": 0.76219684, + "learning_rate": 3.7183911610080937e-06, + "loss": 0.79176414, + "num_input_tokens_seen": 34546970, + "router_z_loss_clip": 3.75976562, + "router_z_loss_mlp": 3.3984375, + "step": 1629, + "time_per_iteration": 3.0297634601593018 + }, + { + "auxiliary_loss_clip": 0.0158128, + "auxiliary_loss_mlp": 0.01386978, + "balance_loss_clip": 1.20540547, + "balance_loss_mlp": 1.0493778, + "epoch": 0.19599591174171827, + "flos": 22196539301760.0, + "grad_norm": 2.866348361558195, + "language_loss": 0.75053608, + "learning_rate": 3.7179924719426872e-06, + "loss": 0.78021872, + "num_input_tokens_seen": 34564865, + "router_z_loss_clip": 3.75976562, + "router_z_loss_mlp": 3.37890625, + "step": 1630, + "time_per_iteration": 3.0202229022979736 + }, + { + "auxiliary_loss_clip": 0.01576067, + "auxiliary_loss_mlp": 0.01390063, + "balance_loss_clip": 1.20100486, + "balance_loss_mlp": 1.05494237, + "epoch": 0.19611615463235738, + "flos": 23770611535680.0, + "grad_norm": 2.7072954797579767, + "language_loss": 0.75870466, + "learning_rate": 3.7175935222591885e-06, + "loss": 0.78836596, + "num_input_tokens_seen": 34584165, + "router_z_loss_clip": 3.75, + "router_z_loss_mlp": 3.35351562, + "step": 1631, + "time_per_iteration": 2.9999053478240967 + }, + { + "auxiliary_loss_clip": 0.01581253, + "auxiliary_loss_mlp": 0.01440107, + "balance_loss_clip": 1.20484853, + "balance_loss_mlp": 1.10460472, + "epoch": 0.19623639752299646, + "flos": 28620571737600.0, + "grad_norm": 1.9278527022213399, + "language_loss": 0.74316543, + "learning_rate": 3.717194312018118e-06, + "loss": 0.77337903, + "num_input_tokens_seen": 34603150, + "router_z_loss_clip": 3.76367188, + "router_z_loss_mlp": 3.35742188, + "step": 1632, + "time_per_iteration": 3.1491734981536865 + }, + { + "auxiliary_loss_clip": 0.01575132, + "auxiliary_loss_mlp": 0.01475689, + "balance_loss_clip": 1.19848037, + "balance_loss_mlp": 1.1441915, + "epoch": 0.19635664041363554, + "flos": 21034826030880.0, + "grad_norm": 3.0225859330613947, + "language_loss": 0.76258814, + "learning_rate": 3.716794841280036e-06, + "loss": 0.7930963, + "num_input_tokens_seen": 34621855, + "router_z_loss_clip": 3.76757812, + "router_z_loss_mlp": 3.31640625, + "step": 1633, + "time_per_iteration": 2.972025156021118 + }, + { + "auxiliary_loss_clip": 0.0157234, + "auxiliary_loss_mlp": 0.01473475, + "balance_loss_clip": 1.19558549, + "balance_loss_mlp": 1.1370182, + "epoch": 0.19647688330427462, + "flos": 18881736677280.0, + "grad_norm": 2.060601728161914, + "language_loss": 0.77700317, + "learning_rate": 3.7163951101055407e-06, + "loss": 0.80746132, + "num_input_tokens_seen": 34639915, + "router_z_loss_clip": 3.76757812, + "router_z_loss_mlp": 3.3671875, + "step": 1634, + "time_per_iteration": 2.950181245803833 + }, + { + "auxiliary_loss_clip": 0.01577422, + "auxiliary_loss_mlp": 0.01460876, + "balance_loss_clip": 1.20058644, + "balance_loss_mlp": 1.12575507, + "epoch": 0.19659712619491373, + "flos": 24244756200000.0, + "grad_norm": 2.1112130278458956, + "language_loss": 0.79185116, + "learning_rate": 3.715995118555273e-06, + "loss": 0.82223415, + "num_input_tokens_seen": 34659890, + "router_z_loss_clip": 3.765625, + "router_z_loss_mlp": 3.35351562, + "step": 1635, + "time_per_iteration": 2.990326404571533 + }, + { + "auxiliary_loss_clip": 0.01580638, + "auxiliary_loss_mlp": 0.01385749, + "balance_loss_clip": 1.20465755, + "balance_loss_mlp": 1.05119991, + "epoch": 0.19671736908555282, + "flos": 24719735283840.0, + "grad_norm": 1.9787319633714973, + "language_loss": 0.86180794, + "learning_rate": 3.71559486668991e-06, + "loss": 0.89147174, + "num_input_tokens_seen": 34678750, + "router_z_loss_clip": 3.75976562, + "router_z_loss_mlp": 3.34765625, + "step": 1636, + "time_per_iteration": 2.994795560836792 + }, + { + "auxiliary_loss_clip": 0.01583439, + "auxiliary_loss_mlp": 0.01403401, + "balance_loss_clip": 1.20967329, + "balance_loss_mlp": 1.05778956, + "epoch": 0.1968376119761919, + "flos": 23844572176320.0, + "grad_norm": 1.7794408950560625, + "language_loss": 0.7713052, + "learning_rate": 3.715194354570169e-06, + "loss": 0.80117363, + "num_input_tokens_seen": 34698755, + "router_z_loss_clip": 3.74023438, + "router_z_loss_mlp": 3.4609375, + "step": 1637, + "time_per_iteration": 3.8545775413513184 + }, + { + "auxiliary_loss_clip": 0.01573134, + "auxiliary_loss_mlp": 0.0140426, + "balance_loss_clip": 1.19711089, + "balance_loss_mlp": 1.0603646, + "epoch": 0.196957854866831, + "flos": 18115883619840.0, + "grad_norm": 2.363489243527166, + "language_loss": 0.83495033, + "learning_rate": 3.714793582256809e-06, + "loss": 0.86472422, + "num_input_tokens_seen": 34715820, + "router_z_loss_clip": 3.75976562, + "router_z_loss_mlp": 3.44335938, + "step": 1638, + "time_per_iteration": 3.909761905670166 + }, + { + "auxiliary_loss_clip": 0.01581015, + "auxiliary_loss_mlp": 0.01429033, + "balance_loss_clip": 1.20452213, + "balance_loss_mlp": 1.08113253, + "epoch": 0.1970780977574701, + "flos": 21655298993760.0, + "grad_norm": 3.42913122815024, + "language_loss": 0.85007721, + "learning_rate": 3.7143925498106253e-06, + "loss": 0.88017774, + "num_input_tokens_seen": 34734360, + "router_z_loss_clip": 3.765625, + "router_z_loss_mlp": 3.484375, + "step": 1639, + "time_per_iteration": 2.9351913928985596 + }, + { + "auxiliary_loss_clip": 0.01579535, + "auxiliary_loss_mlp": 0.01438923, + "balance_loss_clip": 1.20254421, + "balance_loss_mlp": 1.09426522, + "epoch": 0.19719834064810918, + "flos": 20813816456640.0, + "grad_norm": 2.214737137810094, + "language_loss": 0.79160714, + "learning_rate": 3.7139912572924558e-06, + "loss": 0.82179171, + "num_input_tokens_seen": 34753390, + "router_z_loss_clip": 3.7734375, + "router_z_loss_mlp": 3.45117188, + "step": 1640, + "time_per_iteration": 2.9797048568725586 + }, + { + "auxiliary_loss_clip": 0.01573182, + "auxiliary_loss_mlp": 0.01456628, + "balance_loss_clip": 1.19591522, + "balance_loss_mlp": 1.11044383, + "epoch": 0.19731858353874826, + "flos": 23436726664320.0, + "grad_norm": 5.333892325705957, + "language_loss": 0.80705273, + "learning_rate": 3.7135897047631744e-06, + "loss": 0.83735085, + "num_input_tokens_seen": 34771275, + "router_z_loss_clip": 3.7734375, + "router_z_loss_mlp": 3.46679688, + "step": 1641, + "time_per_iteration": 3.828230857849121 + }, + { + "auxiliary_loss_clip": 0.01565867, + "auxiliary_loss_mlp": 0.01483856, + "balance_loss_clip": 1.18911648, + "balance_loss_mlp": 1.14282215, + "epoch": 0.19743882642938737, + "flos": 23990483265120.0, + "grad_norm": 2.0868859682488634, + "language_loss": 0.76374686, + "learning_rate": 3.713187892283698e-06, + "loss": 0.79424405, + "num_input_tokens_seen": 34790885, + "router_z_loss_clip": 3.765625, + "router_z_loss_mlp": 3.4140625, + "step": 1642, + "time_per_iteration": 2.9196953773498535 + }, + { + "auxiliary_loss_clip": 0.01575022, + "auxiliary_loss_mlp": 0.01417048, + "balance_loss_clip": 1.19812977, + "balance_loss_mlp": 1.08116412, + "epoch": 0.19755906932002645, + "flos": 15005667676320.0, + "grad_norm": 2.450729253210168, + "language_loss": 0.87214416, + "learning_rate": 3.71278581991498e-06, + "loss": 0.90206492, + "num_input_tokens_seen": 34806745, + "router_z_loss_clip": 3.77148438, + "router_z_loss_mlp": 3.36132812, + "step": 1643, + "time_per_iteration": 3.0860068798065186 + }, + { + "auxiliary_loss_clip": 0.0157815, + "auxiliary_loss_mlp": 0.01412061, + "balance_loss_clip": 1.20054471, + "balance_loss_mlp": 1.07236242, + "epoch": 0.19767931221066554, + "flos": 19496179062720.0, + "grad_norm": 1.8561300522267632, + "language_loss": 0.79424202, + "learning_rate": 3.712383487718015e-06, + "loss": 0.82414412, + "num_input_tokens_seen": 34824985, + "router_z_loss_clip": 3.77929688, + "router_z_loss_mlp": 3.40039062, + "step": 1644, + "time_per_iteration": 2.997650384902954 + }, + { + "auxiliary_loss_clip": 0.01579855, + "auxiliary_loss_mlp": 0.01395894, + "balance_loss_clip": 1.20145822, + "balance_loss_mlp": 1.06153619, + "epoch": 0.19779955510130465, + "flos": 25740202629600.0, + "grad_norm": 3.1300355543189506, + "language_loss": 0.86830711, + "learning_rate": 3.7119808957538365e-06, + "loss": 0.89806461, + "num_input_tokens_seen": 34843980, + "router_z_loss_clip": 3.78320312, + "router_z_loss_mlp": 3.34570312, + "step": 1645, + "time_per_iteration": 3.7572109699249268 + }, + { + "auxiliary_loss_clip": 0.01580791, + "auxiliary_loss_mlp": 0.01416433, + "balance_loss_clip": 1.20502567, + "balance_loss_mlp": 1.08150291, + "epoch": 0.19791979799194373, + "flos": 20779832460960.0, + "grad_norm": 3.438282834332604, + "language_loss": 0.80082023, + "learning_rate": 3.711578044083517e-06, + "loss": 0.83079243, + "num_input_tokens_seen": 34860780, + "router_z_loss_clip": 3.7578125, + "router_z_loss_mlp": 3.3515625, + "step": 1646, + "time_per_iteration": 3.0264744758605957 + }, + { + "auxiliary_loss_clip": 0.01576789, + "auxiliary_loss_mlp": 0.01417789, + "balance_loss_clip": 1.19808114, + "balance_loss_mlp": 1.07999802, + "epoch": 0.1980400408825828, + "flos": 25591143503520.0, + "grad_norm": 1.786811833802689, + "language_loss": 0.74572396, + "learning_rate": 3.7111749327681698e-06, + "loss": 0.77566969, + "num_input_tokens_seen": 34880815, + "router_z_loss_clip": 3.7890625, + "router_z_loss_mlp": 3.38085938, + "step": 1647, + "time_per_iteration": 3.102137327194214 + }, + { + "auxiliary_loss_clip": 0.01584219, + "auxiliary_loss_mlp": 0.01397022, + "balance_loss_clip": 1.20766568, + "balance_loss_mlp": 1.05789542, + "epoch": 0.1981602837732219, + "flos": 23516111031840.0, + "grad_norm": 2.5420527050648536, + "language_loss": 0.86035013, + "learning_rate": 3.7107715618689455e-06, + "loss": 0.89016253, + "num_input_tokens_seen": 34899790, + "router_z_loss_clip": 3.765625, + "router_z_loss_mlp": 3.39453125, + "step": 1648, + "time_per_iteration": 3.1073994636535645 + }, + { + "auxiliary_loss_clip": 0.01583256, + "auxiliary_loss_mlp": 0.01548278, + "balance_loss_clip": 1.20705283, + "balance_loss_mlp": 1.21811604, + "epoch": 0.198280526663861, + "flos": 23187611959200.0, + "grad_norm": 1.4208703287494993, + "language_loss": 0.83594358, + "learning_rate": 3.710367931447035e-06, + "loss": 0.86725891, + "num_input_tokens_seen": 34921570, + "router_z_loss_clip": 3.765625, + "router_z_loss_mlp": 3.30273438, + "step": 1649, + "time_per_iteration": 3.067943572998047 + }, + { + "auxiliary_loss_clip": 0.01585554, + "auxiliary_loss_mlp": 0.01420631, + "balance_loss_clip": 1.20857012, + "balance_loss_mlp": 1.07063293, + "epoch": 0.1984007695545001, + "flos": 21691634535360.0, + "grad_norm": 5.539007192556003, + "language_loss": 0.86966634, + "learning_rate": 3.70996404156367e-06, + "loss": 0.89972818, + "num_input_tokens_seen": 34941205, + "router_z_loss_clip": 3.77148438, + "router_z_loss_mlp": 3.50585938, + "step": 1650, + "time_per_iteration": 2.964085817337036 + }, + { + "auxiliary_loss_clip": 0.01589118, + "auxiliary_loss_mlp": 0.01409716, + "balance_loss_clip": 1.21197498, + "balance_loss_mlp": 1.06677485, + "epoch": 0.19852101244513917, + "flos": 36066929999040.0, + "grad_norm": 1.876836175296243, + "language_loss": 0.72861922, + "learning_rate": 3.7095598922801187e-06, + "loss": 0.75860763, + "num_input_tokens_seen": 34963280, + "router_z_loss_clip": 3.77148438, + "router_z_loss_mlp": 3.43359375, + "step": 1651, + "time_per_iteration": 3.1373579502105713 + }, + { + "auxiliary_loss_clip": 0.01578274, + "auxiliary_loss_mlp": 0.01412864, + "balance_loss_clip": 1.20093989, + "balance_loss_mlp": 1.06744313, + "epoch": 0.19864125533577828, + "flos": 23107772453760.0, + "grad_norm": 2.7832155116259707, + "language_loss": 0.76514757, + "learning_rate": 3.7091554836576914e-06, + "loss": 0.79505897, + "num_input_tokens_seen": 34979955, + "router_z_loss_clip": 3.7734375, + "router_z_loss_mlp": 3.45898438, + "step": 1652, + "time_per_iteration": 3.048941135406494 + }, + { + "auxiliary_loss_clip": 0.01584337, + "auxiliary_loss_mlp": 0.01399835, + "balance_loss_clip": 1.20722246, + "balance_loss_mlp": 1.05841947, + "epoch": 0.19876149822641737, + "flos": 24610842443520.0, + "grad_norm": 2.1228907677468563, + "language_loss": 0.82963026, + "learning_rate": 3.708750815757736e-06, + "loss": 0.85947204, + "num_input_tokens_seen": 35000725, + "router_z_loss_clip": 3.77148438, + "router_z_loss_mlp": 3.41796875, + "step": 1653, + "time_per_iteration": 3.014925241470337 + }, + { + "auxiliary_loss_clip": 0.01578672, + "auxiliary_loss_mlp": 0.01394907, + "balance_loss_clip": 1.20241249, + "balance_loss_mlp": 1.05082095, + "epoch": 0.19888174111705645, + "flos": 32199356905920.0, + "grad_norm": 2.6854197570490155, + "language_loss": 0.7333042, + "learning_rate": 3.7083458886416407e-06, + "loss": 0.76304007, + "num_input_tokens_seen": 35019920, + "router_z_loss_clip": 3.76171875, + "router_z_loss_mlp": 3.4453125, + "step": 1654, + "time_per_iteration": 3.1561927795410156 + }, + { + "auxiliary_loss_clip": 0.01583557, + "auxiliary_loss_mlp": 0.01406565, + "balance_loss_clip": 1.20657694, + "balance_loss_mlp": 1.06801033, + "epoch": 0.19900198400769553, + "flos": 24610804515360.0, + "grad_norm": 2.9655975011676428, + "language_loss": 0.87689078, + "learning_rate": 3.707940702370832e-06, + "loss": 0.90679199, + "num_input_tokens_seen": 35040765, + "router_z_loss_clip": 3.77148438, + "router_z_loss_mlp": 3.38867188, + "step": 1655, + "time_per_iteration": 3.040809154510498 + }, + { + "auxiliary_loss_clip": 0.01725044, + "auxiliary_loss_mlp": 0.01374931, + "balance_loss_clip": 1.34056497, + "balance_loss_mlp": 1.07280731, + "epoch": 0.19912222689833464, + "flos": 67922540347680.0, + "grad_norm": 0.7645914891199241, + "language_loss": 0.58199513, + "learning_rate": 3.707535257006777e-06, + "loss": 0.61299491, + "num_input_tokens_seen": 35106390, + "router_z_loss_clip": 3.84375, + "router_z_loss_mlp": 3.015625, + "step": 1656, + "time_per_iteration": 3.5072505474090576 + }, + { + "auxiliary_loss_clip": 0.01589092, + "auxiliary_loss_mlp": 0.01513591, + "balance_loss_clip": 1.21035981, + "balance_loss_mlp": 1.17656255, + "epoch": 0.19924246978897373, + "flos": 15744174166080.0, + "grad_norm": 2.4167723201267775, + "language_loss": 0.88621688, + "learning_rate": 3.707129552610981e-06, + "loss": 0.91724366, + "num_input_tokens_seen": 35125040, + "router_z_loss_clip": 3.7890625, + "router_z_loss_mlp": 3.37304688, + "step": 1657, + "time_per_iteration": 3.0278921127319336 + }, + { + "auxiliary_loss_clip": 0.01586423, + "auxiliary_loss_mlp": 0.01538195, + "balance_loss_clip": 1.2093966, + "balance_loss_mlp": 1.20440936, + "epoch": 0.1993627126796128, + "flos": 17568498949920.0, + "grad_norm": 2.634053332480256, + "language_loss": 0.73930097, + "learning_rate": 3.70672358924499e-06, + "loss": 0.77054721, + "num_input_tokens_seen": 35144280, + "router_z_loss_clip": 3.76757812, + "router_z_loss_mlp": 3.33984375, + "step": 1658, + "time_per_iteration": 3.045984983444214 + }, + { + "auxiliary_loss_clip": 0.01595306, + "auxiliary_loss_mlp": 0.01513286, + "balance_loss_clip": 1.21847761, + "balance_loss_mlp": 1.16653013, + "epoch": 0.19948295557025192, + "flos": 40847025801600.0, + "grad_norm": 2.305387463444057, + "language_loss": 0.7861259, + "learning_rate": 3.706317366970386e-06, + "loss": 0.81721187, + "num_input_tokens_seen": 35165280, + "router_z_loss_clip": 3.76757812, + "router_z_loss_mlp": 3.47070312, + "step": 1659, + "time_per_iteration": 3.189263343811035 + }, + { + "auxiliary_loss_clip": 0.0159641, + "auxiliary_loss_mlp": 0.01519057, + "balance_loss_clip": 1.21951413, + "balance_loss_mlp": 1.17592502, + "epoch": 0.199603198460891, + "flos": 25084759538880.0, + "grad_norm": 2.006159851357903, + "language_loss": 0.83884943, + "learning_rate": 3.705910885848795e-06, + "loss": 0.87000412, + "num_input_tokens_seen": 35183655, + "router_z_loss_clip": 3.76953125, + "router_z_loss_mlp": 3.43554688, + "step": 1660, + "time_per_iteration": 3.1102755069732666 + }, + { + "auxiliary_loss_clip": 0.01600511, + "auxiliary_loss_mlp": 0.01495186, + "balance_loss_clip": 1.22574139, + "balance_loss_mlp": 1.14652228, + "epoch": 0.19972344135153008, + "flos": 20086195348800.0, + "grad_norm": 2.1377586003229747, + "language_loss": 0.84524125, + "learning_rate": 3.705504145941879e-06, + "loss": 0.87619823, + "num_input_tokens_seen": 35201825, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 3.48828125, + "step": 1661, + "time_per_iteration": 2.979733467102051 + }, + { + "auxiliary_loss_clip": 0.01589482, + "auxiliary_loss_mlp": 0.01421954, + "balance_loss_clip": 1.21327555, + "balance_loss_mlp": 1.08587909, + "epoch": 0.1998436842421692, + "flos": 23729459117760.0, + "grad_norm": 2.1427807776229186, + "language_loss": 0.78935206, + "learning_rate": 3.7050971473113403e-06, + "loss": 0.81946641, + "num_input_tokens_seen": 35221600, + "router_z_loss_clip": 3.76171875, + "router_z_loss_mlp": 3.36328125, + "step": 1662, + "time_per_iteration": 3.036444902420044 + }, + { + "auxiliary_loss_clip": 0.01589104, + "auxiliary_loss_mlp": 0.01416829, + "balance_loss_clip": 1.2132895, + "balance_loss_mlp": 1.07903743, + "epoch": 0.19996392713280828, + "flos": 36105389517600.0, + "grad_norm": 1.7546770982788613, + "language_loss": 0.79964924, + "learning_rate": 3.7046898900189196e-06, + "loss": 0.82970858, + "num_input_tokens_seen": 35245935, + "router_z_loss_clip": 3.7578125, + "router_z_loss_mlp": 3.38085938, + "step": 1663, + "time_per_iteration": 3.0727086067199707 + }, + { + "auxiliary_loss_clip": 0.01591343, + "auxiliary_loss_mlp": 0.0143432, + "balance_loss_clip": 1.21524167, + "balance_loss_mlp": 1.09271383, + "epoch": 0.20008417002344736, + "flos": 23659822287360.0, + "grad_norm": 2.678556991738368, + "language_loss": 0.82986552, + "learning_rate": 3.704282374126398e-06, + "loss": 0.86012214, + "num_input_tokens_seen": 35265615, + "router_z_loss_clip": 3.76367188, + "router_z_loss_mlp": 3.41992188, + "step": 1664, + "time_per_iteration": 3.790072441101074 + }, + { + "auxiliary_loss_clip": 0.01592214, + "auxiliary_loss_mlp": 0.0143853, + "balance_loss_clip": 1.21555519, + "balance_loss_mlp": 1.09692419, + "epoch": 0.20020441291408644, + "flos": 21874260447360.0, + "grad_norm": 1.8925782811133895, + "language_loss": 0.8769989, + "learning_rate": 3.7038745996955954e-06, + "loss": 0.90730637, + "num_input_tokens_seen": 35284960, + "router_z_loss_clip": 3.765625, + "router_z_loss_mlp": 3.41992188, + "step": 1665, + "time_per_iteration": 2.9994184970855713 + }, + { + "auxiliary_loss_clip": 0.01597357, + "auxiliary_loss_mlp": 0.01416716, + "balance_loss_clip": 1.22133756, + "balance_loss_mlp": 1.0766362, + "epoch": 0.20032465580472555, + "flos": 23181619309920.0, + "grad_norm": 2.9910314584798807, + "language_loss": 0.72156644, + "learning_rate": 3.703466566788371e-06, + "loss": 0.7517072, + "num_input_tokens_seen": 35304090, + "router_z_loss_clip": 3.75976562, + "router_z_loss_mlp": 3.40429688, + "step": 1666, + "time_per_iteration": 3.985866069793701 + }, + { + "auxiliary_loss_clip": 0.01593116, + "auxiliary_loss_mlp": 0.01412781, + "balance_loss_clip": 1.21627736, + "balance_loss_mlp": 1.07479858, + "epoch": 0.20044489869536464, + "flos": 23875521919200.0, + "grad_norm": 1.9590953102444215, + "language_loss": 0.74539614, + "learning_rate": 3.703058275466622e-06, + "loss": 0.77545506, + "num_input_tokens_seen": 35323325, + "router_z_loss_clip": 3.77148438, + "router_z_loss_mlp": 3.3828125, + "step": 1667, + "time_per_iteration": 3.059049367904663 + }, + { + "auxiliary_loss_clip": 0.01590041, + "auxiliary_loss_mlp": 0.01405407, + "balance_loss_clip": 1.2138983, + "balance_loss_mlp": 1.0699048, + "epoch": 0.20056514158600372, + "flos": 21947424596640.0, + "grad_norm": 2.343727601932599, + "language_loss": 0.77952904, + "learning_rate": 3.7026497257922877e-06, + "loss": 0.80948353, + "num_input_tokens_seen": 35343635, + "router_z_loss_clip": 3.76171875, + "router_z_loss_mlp": 3.35742188, + "step": 1668, + "time_per_iteration": 3.025106430053711 + }, + { + "auxiliary_loss_clip": 0.01594493, + "auxiliary_loss_mlp": 0.01414865, + "balance_loss_clip": 1.21764708, + "balance_loss_mlp": 1.07878995, + "epoch": 0.20068538447664283, + "flos": 23881476640320.0, + "grad_norm": 1.6902911412662087, + "language_loss": 0.85380667, + "learning_rate": 3.7022409178273436e-06, + "loss": 0.88390028, + "num_input_tokens_seen": 35364615, + "router_z_loss_clip": 3.76757812, + "router_z_loss_mlp": 3.36328125, + "step": 1669, + "time_per_iteration": 3.936441659927368 + }, + { + "auxiliary_loss_clip": 0.01584766, + "auxiliary_loss_mlp": 0.01390208, + "balance_loss_clip": 1.20666945, + "balance_loss_mlp": 1.05012786, + "epoch": 0.2008056273672819, + "flos": 18444837830400.0, + "grad_norm": 2.0114498537255794, + "language_loss": 0.78443968, + "learning_rate": 3.7018318516338054e-06, + "loss": 0.81418943, + "num_input_tokens_seen": 35383775, + "router_z_loss_clip": 3.78125, + "router_z_loss_mlp": 3.40429688, + "step": 1670, + "time_per_iteration": 2.9865641593933105 + }, + { + "auxiliary_loss_clip": 0.01592452, + "auxiliary_loss_mlp": 0.01407886, + "balance_loss_clip": 1.21819866, + "balance_loss_mlp": 1.06723332, + "epoch": 0.200925870257921, + "flos": 23661794551680.0, + "grad_norm": 3.4411993298024504, + "language_loss": 0.81719351, + "learning_rate": 3.7014225272737284e-06, + "loss": 0.84719694, + "num_input_tokens_seen": 35403000, + "router_z_loss_clip": 3.74023438, + "router_z_loss_mlp": 3.41015625, + "step": 1671, + "time_per_iteration": 3.1315879821777344 + }, + { + "auxiliary_loss_clip": 0.01595706, + "auxiliary_loss_mlp": 0.01417674, + "balance_loss_clip": 1.2191925, + "balance_loss_mlp": 1.07644892, + "epoch": 0.20104611314856008, + "flos": 16219001537280.0, + "grad_norm": 2.752342240892751, + "language_loss": 0.74164081, + "learning_rate": 3.701012944809207e-06, + "loss": 0.77177465, + "num_input_tokens_seen": 35420115, + "router_z_loss_clip": 3.76367188, + "router_z_loss_mlp": 3.41601562, + "step": 1672, + "time_per_iteration": 3.0699949264526367 + }, + { + "auxiliary_loss_clip": 0.01592786, + "auxiliary_loss_mlp": 0.01414153, + "balance_loss_clip": 1.21816552, + "balance_loss_mlp": 1.07578921, + "epoch": 0.2011663560391992, + "flos": 21399508932480.0, + "grad_norm": 3.6859379331841406, + "language_loss": 0.78737718, + "learning_rate": 3.700603104302374e-06, + "loss": 0.81744659, + "num_input_tokens_seen": 35439925, + "router_z_loss_clip": 3.74609375, + "router_z_loss_mlp": 3.38671875, + "step": 1673, + "time_per_iteration": 3.8247461318969727 + }, + { + "auxiliary_loss_clip": 0.01728737, + "auxiliary_loss_mlp": 0.01436108, + "balance_loss_clip": 1.34517634, + "balance_loss_mlp": 1.12482834, + "epoch": 0.20128659892983827, + "flos": 62236179982080.0, + "grad_norm": 0.9310292646387985, + "language_loss": 0.55919904, + "learning_rate": 3.7001930058154027e-06, + "loss": 0.59084755, + "num_input_tokens_seen": 35504885, + "router_z_loss_clip": 3.84375, + "router_z_loss_mlp": 3.109375, + "step": 1674, + "time_per_iteration": 3.5194873809814453 + }, + { + "auxiliary_loss_clip": 0.01598319, + "auxiliary_loss_mlp": 0.01497437, + "balance_loss_clip": 1.22103441, + "balance_loss_mlp": 1.16708422, + "epoch": 0.20140684182047736, + "flos": 28441245575520.0, + "grad_norm": 2.7965587981585496, + "language_loss": 0.80102372, + "learning_rate": 3.6997826494105037e-06, + "loss": 0.83198124, + "num_input_tokens_seen": 35525330, + "router_z_loss_clip": 3.7734375, + "router_z_loss_mlp": 3.3046875, + "step": 1675, + "time_per_iteration": 3.0184996128082275 + }, + { + "auxiliary_loss_clip": 0.01596017, + "auxiliary_loss_mlp": 0.01488196, + "balance_loss_clip": 1.2200253, + "balance_loss_mlp": 1.15002286, + "epoch": 0.20152708471111647, + "flos": 28076562673920.0, + "grad_norm": 2.1357462263369937, + "language_loss": 0.69725031, + "learning_rate": 3.6993720351499286e-06, + "loss": 0.72809243, + "num_input_tokens_seen": 35546455, + "router_z_loss_clip": 3.75585938, + "router_z_loss_mlp": 3.38476562, + "step": 1676, + "time_per_iteration": 3.007141351699829 + }, + { + "auxiliary_loss_clip": 0.01602326, + "auxiliary_loss_mlp": 0.01480028, + "balance_loss_clip": 1.22473073, + "balance_loss_mlp": 1.1426177, + "epoch": 0.20164732760175555, + "flos": 23551991435520.0, + "grad_norm": 1.7822952584055138, + "language_loss": 0.7719928, + "learning_rate": 3.6989611630959666e-06, + "loss": 0.80281633, + "num_input_tokens_seen": 35565010, + "router_z_loss_clip": 3.77929688, + "router_z_loss_mlp": 3.37695312, + "step": 1677, + "time_per_iteration": 3.0799436569213867 + }, + { + "auxiliary_loss_clip": 0.01722467, + "auxiliary_loss_mlp": 0.01492744, + "balance_loss_clip": 1.33901906, + "balance_loss_mlp": 1.170784, + "epoch": 0.20176757049239463, + "flos": 71108120273760.0, + "grad_norm": 0.6995810514868485, + "language_loss": 0.58203077, + "learning_rate": 3.6985500333109474e-06, + "loss": 0.61418295, + "num_input_tokens_seen": 35633340, + "router_z_loss_clip": 3.8359375, + "router_z_loss_mlp": 3.21875, + "step": 1678, + "time_per_iteration": 3.4760613441467285 + }, + { + "auxiliary_loss_clip": 0.01598364, + "auxiliary_loss_mlp": 0.0144079, + "balance_loss_clip": 1.21934187, + "balance_loss_mlp": 1.10490561, + "epoch": 0.20188781338303372, + "flos": 21432089586240.0, + "grad_norm": 2.1876829769834294, + "language_loss": 0.76546407, + "learning_rate": 3.6981386458572385e-06, + "loss": 0.79585564, + "num_input_tokens_seen": 35651315, + "router_z_loss_clip": 3.79492188, + "router_z_loss_mlp": 3.36132812, + "step": 1679, + "time_per_iteration": 3.0092830657958984 + }, + { + "auxiliary_loss_clip": 0.01591526, + "auxiliary_loss_mlp": 0.01391291, + "balance_loss_clip": 1.21477497, + "balance_loss_mlp": 1.05807686, + "epoch": 0.20200805627367283, + "flos": 11547798575040.0, + "grad_norm": 4.682559457108897, + "language_loss": 0.76561826, + "learning_rate": 3.6977270007972468e-06, + "loss": 0.79544646, + "num_input_tokens_seen": 35668850, + "router_z_loss_clip": 3.76953125, + "router_z_loss_mlp": 3.33398438, + "step": 1680, + "time_per_iteration": 2.9982306957244873 + }, + { + "auxiliary_loss_clip": 0.01594459, + "auxiliary_loss_mlp": 0.01381773, + "balance_loss_clip": 1.21668077, + "balance_loss_mlp": 1.04302835, + "epoch": 0.2021282991643119, + "flos": 28547748941760.0, + "grad_norm": 3.846547989051476, + "language_loss": 0.72986889, + "learning_rate": 3.6973150981934196e-06, + "loss": 0.75963116, + "num_input_tokens_seen": 35690080, + "router_z_loss_clip": 3.78125, + "router_z_loss_mlp": 3.390625, + "step": 1681, + "time_per_iteration": 2.9673640727996826 + }, + { + "auxiliary_loss_clip": 0.01589698, + "auxiliary_loss_mlp": 0.01398417, + "balance_loss_clip": 1.21297526, + "balance_loss_mlp": 1.05662024, + "epoch": 0.202248542054951, + "flos": 17924951440800.0, + "grad_norm": 2.7466007080382293, + "language_loss": 0.83992827, + "learning_rate": 3.6969029381082415e-06, + "loss": 0.86980951, + "num_input_tokens_seen": 35706075, + "router_z_loss_clip": 3.77148438, + "router_z_loss_mlp": 3.421875, + "step": 1682, + "time_per_iteration": 3.044337272644043 + }, + { + "auxiliary_loss_clip": 0.01592086, + "auxiliary_loss_mlp": 0.01399295, + "balance_loss_clip": 1.21402848, + "balance_loss_mlp": 1.0523479, + "epoch": 0.2023687849455901, + "flos": 19866664972800.0, + "grad_norm": 6.044154175548286, + "language_loss": 0.79784286, + "learning_rate": 3.696490520604237e-06, + "loss": 0.82775664, + "num_input_tokens_seen": 35724765, + "router_z_loss_clip": 3.78320312, + "router_z_loss_mlp": 3.47460938, + "step": 1683, + "time_per_iteration": 3.1170413494110107 + }, + { + "auxiliary_loss_clip": 0.01590073, + "auxiliary_loss_mlp": 0.01416481, + "balance_loss_clip": 1.21202946, + "balance_loss_mlp": 1.07125092, + "epoch": 0.20248902783622919, + "flos": 22566911427360.0, + "grad_norm": 1.9172134379140138, + "language_loss": 0.80759847, + "learning_rate": 3.696077845743968e-06, + "loss": 0.83766401, + "num_input_tokens_seen": 35744355, + "router_z_loss_clip": 3.77734375, + "router_z_loss_mlp": 3.45703125, + "step": 1684, + "time_per_iteration": 3.15079665184021 + }, + { + "auxiliary_loss_clip": 0.01582457, + "auxiliary_loss_mlp": 0.0141964, + "balance_loss_clip": 1.20378137, + "balance_loss_mlp": 1.07517242, + "epoch": 0.20260927072686827, + "flos": 22711874312160.0, + "grad_norm": 3.550892974462115, + "language_loss": 0.73942411, + "learning_rate": 3.69566491359004e-06, + "loss": 0.76944506, + "num_input_tokens_seen": 35761000, + "router_z_loss_clip": 3.7890625, + "router_z_loss_mlp": 3.44921875, + "step": 1685, + "time_per_iteration": 3.089540481567383 + }, + { + "auxiliary_loss_clip": 0.01594837, + "auxiliary_loss_mlp": 0.01414049, + "balance_loss_clip": 1.21634865, + "balance_loss_mlp": 1.06729281, + "epoch": 0.20272951361750738, + "flos": 51027804154080.0, + "grad_norm": 2.1023108579099485, + "language_loss": 0.69696724, + "learning_rate": 3.695251724205092e-06, + "loss": 0.72705615, + "num_input_tokens_seen": 35785360, + "router_z_loss_clip": 3.78515625, + "router_z_loss_mlp": 3.47265625, + "step": 1686, + "time_per_iteration": 3.259136199951172 + }, + { + "auxiliary_loss_clip": 0.01587193, + "auxiliary_loss_mlp": 0.0141692, + "balance_loss_clip": 1.20789862, + "balance_loss_mlp": 1.07416964, + "epoch": 0.20284975650814646, + "flos": 26580547321920.0, + "grad_norm": 1.8834211077892593, + "language_loss": 0.8649888, + "learning_rate": 3.6948382776518054e-06, + "loss": 0.89503002, + "num_input_tokens_seen": 35806065, + "router_z_loss_clip": 3.79492188, + "router_z_loss_mlp": 3.43164062, + "step": 1687, + "time_per_iteration": 3.1228268146514893 + }, + { + "auxiliary_loss_clip": 0.01590786, + "auxiliary_loss_mlp": 0.01440503, + "balance_loss_clip": 1.21158028, + "balance_loss_mlp": 1.0929842, + "epoch": 0.20296999939878554, + "flos": 16036072200000.0, + "grad_norm": 3.1736553725619245, + "language_loss": 0.79364157, + "learning_rate": 3.6944245739929e-06, + "loss": 0.8239544, + "num_input_tokens_seen": 35822225, + "router_z_loss_clip": 3.7890625, + "router_z_loss_mlp": 3.48046875, + "step": 1688, + "time_per_iteration": 2.9911091327667236 + }, + { + "auxiliary_loss_clip": 0.015919, + "auxiliary_loss_mlp": 0.01419992, + "balance_loss_clip": 1.21146119, + "balance_loss_mlp": 1.07209206, + "epoch": 0.20309024228942463, + "flos": 19205115448320.0, + "grad_norm": 3.209258773479175, + "language_loss": 0.72174376, + "learning_rate": 3.6940106132911332e-06, + "loss": 0.75186265, + "num_input_tokens_seen": 35839410, + "router_z_loss_clip": 3.80664062, + "router_z_loss_mlp": 3.484375, + "step": 1689, + "time_per_iteration": 2.9708950519561768 + }, + { + "auxiliary_loss_clip": 0.01584046, + "auxiliary_loss_mlp": 0.01416789, + "balance_loss_clip": 1.20395398, + "balance_loss_mlp": 1.07175004, + "epoch": 0.20321048518006374, + "flos": 22823346267360.0, + "grad_norm": 2.173970788232496, + "language_loss": 0.89038789, + "learning_rate": 3.6935963956093037e-06, + "loss": 0.92039621, + "num_input_tokens_seen": 35859495, + "router_z_loss_clip": 3.8046875, + "router_z_loss_mlp": 3.45507812, + "step": 1690, + "time_per_iteration": 3.0601251125335693 + }, + { + "auxiliary_loss_clip": 0.01588841, + "auxiliary_loss_mlp": 0.01420152, + "balance_loss_clip": 1.20684481, + "balance_loss_mlp": 1.07797408, + "epoch": 0.20333072807070282, + "flos": 19098270728640.0, + "grad_norm": 1.866279207569556, + "language_loss": 0.68581939, + "learning_rate": 3.6931819210102474e-06, + "loss": 0.71590936, + "num_input_tokens_seen": 35878890, + "router_z_loss_clip": 3.8203125, + "router_z_loss_mlp": 3.42578125, + "step": 1691, + "time_per_iteration": 3.837217092514038 + }, + { + "auxiliary_loss_clip": 0.01583695, + "auxiliary_loss_mlp": 0.01404502, + "balance_loss_clip": 1.20321417, + "balance_loss_mlp": 1.06041598, + "epoch": 0.2034509709613419, + "flos": 18182599981920.0, + "grad_norm": 1.8013479364264084, + "language_loss": 0.84620929, + "learning_rate": 3.6927671895568402e-06, + "loss": 0.87609124, + "num_input_tokens_seen": 35897950, + "router_z_loss_clip": 3.80664062, + "router_z_loss_mlp": 3.4453125, + "step": 1692, + "time_per_iteration": 3.0104198455810547 + }, + { + "auxiliary_loss_clip": 0.01588223, + "auxiliary_loss_mlp": 0.01419888, + "balance_loss_clip": 1.20875192, + "balance_loss_mlp": 1.08514786, + "epoch": 0.20357121385198101, + "flos": 22925791320480.0, + "grad_norm": 1.9774381675337183, + "language_loss": 0.86995065, + "learning_rate": 3.692352201311996e-06, + "loss": 0.90003169, + "num_input_tokens_seen": 35916800, + "router_z_loss_clip": 3.79882812, + "router_z_loss_mlp": 3.34960938, + "step": 1693, + "time_per_iteration": 3.1410999298095703 + }, + { + "auxiliary_loss_clip": 0.01583144, + "auxiliary_loss_mlp": 0.01474559, + "balance_loss_clip": 1.20155811, + "balance_loss_mlp": 1.13638616, + "epoch": 0.2036914567426201, + "flos": 20924188495200.0, + "grad_norm": 2.3138074911745634, + "language_loss": 0.76874578, + "learning_rate": 3.6919369563386687e-06, + "loss": 0.79932284, + "num_input_tokens_seen": 35936600, + "router_z_loss_clip": 3.81640625, + "router_z_loss_mlp": 3.38476562, + "step": 1694, + "time_per_iteration": 3.930546760559082 + }, + { + "auxiliary_loss_clip": 0.01583731, + "auxiliary_loss_mlp": 0.01494427, + "balance_loss_clip": 1.20266426, + "balance_loss_mlp": 1.15816164, + "epoch": 0.20381169963325918, + "flos": 15520737189600.0, + "grad_norm": 2.4927438631222407, + "language_loss": 0.79131371, + "learning_rate": 3.69152145469985e-06, + "loss": 0.82209527, + "num_input_tokens_seen": 35953645, + "router_z_loss_clip": 3.81054688, + "router_z_loss_mlp": 3.36328125, + "step": 1695, + "time_per_iteration": 2.9809865951538086 + }, + { + "auxiliary_loss_clip": 0.01583742, + "auxiliary_loss_mlp": 0.0149985, + "balance_loss_clip": 1.20188713, + "balance_loss_mlp": 1.16911626, + "epoch": 0.20393194252389826, + "flos": 28835285237280.0, + "grad_norm": 2.275126679260828, + "language_loss": 0.8246001, + "learning_rate": 3.691105696458572e-06, + "loss": 0.85543597, + "num_input_tokens_seen": 35970940, + "router_z_loss_clip": 3.8203125, + "router_z_loss_mlp": 3.30859375, + "step": 1696, + "time_per_iteration": 4.040812253952026 + }, + { + "auxiliary_loss_clip": 0.01583866, + "auxiliary_loss_mlp": 0.0149184, + "balance_loss_clip": 1.20152783, + "balance_loss_mlp": 1.15748167, + "epoch": 0.20405218541453737, + "flos": 22490561312640.0, + "grad_norm": 2.7066797926431603, + "language_loss": 0.68035042, + "learning_rate": 3.690689681677904e-06, + "loss": 0.71110749, + "num_input_tokens_seen": 35989410, + "router_z_loss_clip": 3.82421875, + "router_z_loss_mlp": 3.34570312, + "step": 1697, + "time_per_iteration": 2.9782121181488037 + }, + { + "auxiliary_loss_clip": 0.0158007, + "auxiliary_loss_mlp": 0.01480576, + "balance_loss_clip": 1.19862485, + "balance_loss_mlp": 1.14831567, + "epoch": 0.20417242830517646, + "flos": 25376771357280.0, + "grad_norm": 1.891033422231206, + "language_loss": 0.88905787, + "learning_rate": 3.690273410420956e-06, + "loss": 0.91966432, + "num_input_tokens_seen": 36009175, + "router_z_loss_clip": 3.81445312, + "router_z_loss_mlp": 3.32421875, + "step": 1698, + "time_per_iteration": 3.0226924419403076 + }, + { + "auxiliary_loss_clip": 0.01580052, + "auxiliary_loss_mlp": 0.01421874, + "balance_loss_clip": 1.19922173, + "balance_loss_mlp": 1.0850358, + "epoch": 0.20429267119581554, + "flos": 14794822848960.0, + "grad_norm": 8.518589053724801, + "language_loss": 0.76637292, + "learning_rate": 3.689856882750875e-06, + "loss": 0.7963922, + "num_input_tokens_seen": 36024375, + "router_z_loss_clip": 3.81054688, + "router_z_loss_mlp": 3.37109375, + "step": 1699, + "time_per_iteration": 3.0219874382019043 + }, + { + "auxiliary_loss_clip": 0.01582853, + "auxiliary_loss_mlp": 0.01401156, + "balance_loss_clip": 1.20007324, + "balance_loss_mlp": 1.06584418, + "epoch": 0.20441291408645465, + "flos": 17783591731200.0, + "grad_norm": 2.1294482966840436, + "language_loss": 0.78615451, + "learning_rate": 3.6894400987308486e-06, + "loss": 0.81599462, + "num_input_tokens_seen": 36041895, + "router_z_loss_clip": 3.83007812, + "router_z_loss_mlp": 3.35546875, + "step": 1700, + "time_per_iteration": 3.85428524017334 + }, + { + "auxiliary_loss_clip": 0.01574843, + "auxiliary_loss_mlp": 0.0140224, + "balance_loss_clip": 1.19442952, + "balance_loss_mlp": 1.07017064, + "epoch": 0.20453315697709373, + "flos": 16437545781120.0, + "grad_norm": 3.2526228405037294, + "language_loss": 0.85135829, + "learning_rate": 3.6890230584241024e-06, + "loss": 0.88112915, + "num_input_tokens_seen": 36058825, + "router_z_loss_clip": 3.80273438, + "router_z_loss_mlp": 3.32226562, + "step": 1701, + "time_per_iteration": 2.9559645652770996 + }, + { + "auxiliary_loss_clip": 0.01727107, + "auxiliary_loss_mlp": 0.01417457, + "balance_loss_clip": 1.33809233, + "balance_loss_mlp": 1.11533356, + "epoch": 0.20465339986773282, + "flos": 66719826371520.0, + "grad_norm": 1.1635809444894032, + "language_loss": 0.6636197, + "learning_rate": 3.6886057618939016e-06, + "loss": 0.69506532, + "num_input_tokens_seen": 36121645, + "router_z_loss_clip": 3.890625, + "router_z_loss_mlp": 3.015625, + "step": 1702, + "time_per_iteration": 3.486690044403076 + }, + { + "auxiliary_loss_clip": 0.01585441, + "auxiliary_loss_mlp": 0.01382653, + "balance_loss_clip": 1.2028625, + "balance_loss_mlp": 1.04791331, + "epoch": 0.2047736427583719, + "flos": 41977334191680.0, + "grad_norm": 2.816172062712823, + "language_loss": 0.69737822, + "learning_rate": 3.6881882092035492e-06, + "loss": 0.72705919, + "num_input_tokens_seen": 36143030, + "router_z_loss_clip": 3.828125, + "router_z_loss_mlp": 3.34960938, + "step": 1703, + "time_per_iteration": 3.2013349533081055 + }, + { + "auxiliary_loss_clip": 0.0172287, + "auxiliary_loss_mlp": 0.01365524, + "balance_loss_clip": 1.33422351, + "balance_loss_mlp": 1.0504303, + "epoch": 0.204893885649011, + "flos": 69946710792480.0, + "grad_norm": 0.9420654148644211, + "language_loss": 0.61212838, + "learning_rate": 3.6877704004163873e-06, + "loss": 0.64301229, + "num_input_tokens_seen": 36203435, + "router_z_loss_clip": 3.890625, + "router_z_loss_mlp": 3.1484375, + "step": 1704, + "time_per_iteration": 3.522890329360962 + }, + { + "auxiliary_loss_clip": 0.01578521, + "auxiliary_loss_mlp": 0.01414268, + "balance_loss_clip": 1.19709051, + "balance_loss_mlp": 1.07266212, + "epoch": 0.2050141285396501, + "flos": 22202190597600.0, + "grad_norm": 1.9305788882398547, + "language_loss": 0.78329849, + "learning_rate": 3.6873523355957984e-06, + "loss": 0.8132264, + "num_input_tokens_seen": 36222435, + "router_z_loss_clip": 3.8125, + "router_z_loss_mlp": 3.41992188, + "step": 1705, + "time_per_iteration": 3.165153980255127 + }, + { + "auxiliary_loss_clip": 0.01719178, + "auxiliary_loss_mlp": 0.01469803, + "balance_loss_clip": 1.3303895, + "balance_loss_mlp": 1.1806488, + "epoch": 0.20513437143028918, + "flos": 46289467255680.0, + "grad_norm": 1.0881677623759731, + "language_loss": 0.64122295, + "learning_rate": 3.686934014805201e-06, + "loss": 0.67311281, + "num_input_tokens_seen": 36273065, + "router_z_loss_clip": 3.890625, + "router_z_loss_mlp": 2.8984375, + "step": 1706, + "time_per_iteration": 3.3061060905456543 + }, + { + "auxiliary_loss_clip": 0.01577456, + "auxiliary_loss_mlp": 0.01496851, + "balance_loss_clip": 1.19591928, + "balance_loss_mlp": 1.16153884, + "epoch": 0.20525461432092829, + "flos": 21906272178720.0, + "grad_norm": 2.0036293793904, + "language_loss": 0.81422836, + "learning_rate": 3.6865154381080552e-06, + "loss": 0.84497142, + "num_input_tokens_seen": 36293750, + "router_z_loss_clip": 3.8125, + "router_z_loss_mlp": 3.35546875, + "step": 1707, + "time_per_iteration": 3.0508980751037598 + }, + { + "auxiliary_loss_clip": 0.01570598, + "auxiliary_loss_mlp": 0.01408122, + "balance_loss_clip": 1.18940353, + "balance_loss_mlp": 1.0710938, + "epoch": 0.20537485721156737, + "flos": 21216352026240.0, + "grad_norm": 3.945280262647854, + "language_loss": 0.82797635, + "learning_rate": 3.6860966055678585e-06, + "loss": 0.85776353, + "num_input_tokens_seen": 36310105, + "router_z_loss_clip": 3.8125, + "router_z_loss_mlp": 3.37304688, + "step": 1708, + "time_per_iteration": 3.02644419670105 + }, + { + "auxiliary_loss_clip": 0.01580316, + "auxiliary_loss_mlp": 0.01382007, + "balance_loss_clip": 1.19902635, + "balance_loss_mlp": 1.04612315, + "epoch": 0.20549510010220645, + "flos": 20193153852960.0, + "grad_norm": 1.7190857595810858, + "language_loss": 0.8667649, + "learning_rate": 3.685677517248147e-06, + "loss": 0.89638811, + "num_input_tokens_seen": 36328995, + "router_z_loss_clip": 3.80859375, + "router_z_loss_mlp": 3.36132812, + "step": 1709, + "time_per_iteration": 3.0081961154937744 + }, + { + "auxiliary_loss_clip": 0.0158768, + "auxiliary_loss_mlp": 0.01380218, + "balance_loss_clip": 1.20518923, + "balance_loss_mlp": 1.04719472, + "epoch": 0.20561534299284553, + "flos": 17018459308800.0, + "grad_norm": 2.097967559783381, + "language_loss": 0.80558735, + "learning_rate": 3.6852581732124967e-06, + "loss": 0.83526635, + "num_input_tokens_seen": 36346340, + "router_z_loss_clip": 3.82421875, + "router_z_loss_mlp": 3.33203125, + "step": 1710, + "time_per_iteration": 3.0874736309051514 + }, + { + "auxiliary_loss_clip": 0.01575703, + "auxiliary_loss_mlp": 0.0139617, + "balance_loss_clip": 1.19398224, + "balance_loss_mlp": 1.06200218, + "epoch": 0.20573558588348465, + "flos": 22892527959840.0, + "grad_norm": 2.0490486697343133, + "language_loss": 0.76613641, + "learning_rate": 3.6848385735245213e-06, + "loss": 0.79585505, + "num_input_tokens_seen": 36365430, + "router_z_loss_clip": 3.81835938, + "router_z_loss_mlp": 3.34375, + "step": 1711, + "time_per_iteration": 3.032247543334961 + }, + { + "auxiliary_loss_clip": 0.01565114, + "auxiliary_loss_mlp": 0.01370924, + "balance_loss_clip": 1.18233395, + "balance_loss_mlp": 1.0394268, + "epoch": 0.20585582877412373, + "flos": 24645357433440.0, + "grad_norm": 1.86849213238156, + "language_loss": 0.86777341, + "learning_rate": 3.6844187182478734e-06, + "loss": 0.89713383, + "num_input_tokens_seen": 36386285, + "router_z_loss_clip": 3.828125, + "router_z_loss_mlp": 3.31640625, + "step": 1712, + "time_per_iteration": 3.0798351764678955 + }, + { + "auxiliary_loss_clip": 0.01571639, + "auxiliary_loss_mlp": 0.0138108, + "balance_loss_clip": 1.19029975, + "balance_loss_mlp": 1.04462409, + "epoch": 0.2059760716647628, + "flos": 24209255077920.0, + "grad_norm": 2.292618394790154, + "language_loss": 0.7521928, + "learning_rate": 3.683998607446246e-06, + "loss": 0.78171998, + "num_input_tokens_seen": 36404935, + "router_z_loss_clip": 3.8125, + "router_z_loss_mlp": 3.3671875, + "step": 1713, + "time_per_iteration": 3.059025764465332 + }, + { + "auxiliary_loss_clip": 0.01571683, + "auxiliary_loss_mlp": 0.01370042, + "balance_loss_clip": 1.19099414, + "balance_loss_mlp": 1.03778195, + "epoch": 0.20609631455540192, + "flos": 20231575443360.0, + "grad_norm": 2.639730230459479, + "language_loss": 0.7527867, + "learning_rate": 3.6835782411833686e-06, + "loss": 0.78220391, + "num_input_tokens_seen": 36424455, + "router_z_loss_clip": 3.80859375, + "router_z_loss_mlp": 3.32421875, + "step": 1714, + "time_per_iteration": 3.0061745643615723 + }, + { + "auxiliary_loss_clip": 0.01564505, + "auxiliary_loss_mlp": 0.01384533, + "balance_loss_clip": 1.18126893, + "balance_loss_mlp": 1.05074728, + "epoch": 0.206216557446041, + "flos": 19867006326240.0, + "grad_norm": 1.7784855976533271, + "language_loss": 0.74279648, + "learning_rate": 3.68315761952301e-06, + "loss": 0.77228677, + "num_input_tokens_seen": 36441685, + "router_z_loss_clip": 3.83007812, + "router_z_loss_mlp": 3.33984375, + "step": 1715, + "time_per_iteration": 3.2102417945861816 + }, + { + "auxiliary_loss_clip": 0.01570311, + "auxiliary_loss_mlp": 0.01391322, + "balance_loss_clip": 1.18908823, + "balance_loss_mlp": 1.06039739, + "epoch": 0.2063368003366801, + "flos": 24098541685920.0, + "grad_norm": 2.290230235146808, + "language_loss": 0.83220565, + "learning_rate": 3.6827367425289797e-06, + "loss": 0.86182201, + "num_input_tokens_seen": 36461460, + "router_z_loss_clip": 3.81445312, + "router_z_loss_mlp": 3.31054688, + "step": 1716, + "time_per_iteration": 3.0489320755004883 + }, + { + "auxiliary_loss_clip": 0.01564453, + "auxiliary_loss_mlp": 0.01598359, + "balance_loss_clip": 1.18299377, + "balance_loss_mlp": 1.27830601, + "epoch": 0.2064570432273192, + "flos": 20342288835360.0, + "grad_norm": 2.7619994893507123, + "language_loss": 0.72445834, + "learning_rate": 3.6823156102651225e-06, + "loss": 0.75608647, + "num_input_tokens_seen": 36479615, + "router_z_loss_clip": 3.81445312, + "router_z_loss_mlp": 3.19921875, + "step": 1717, + "time_per_iteration": 2.9542956352233887 + }, + { + "auxiliary_loss_clip": 0.01570027, + "auxiliary_loss_mlp": 0.01607372, + "balance_loss_clip": 1.18878353, + "balance_loss_mlp": 1.28865409, + "epoch": 0.20657728611795828, + "flos": 20522373560640.0, + "grad_norm": 2.4468282230452294, + "language_loss": 0.71302056, + "learning_rate": 3.6818942227953257e-06, + "loss": 0.74479455, + "num_input_tokens_seen": 36500160, + "router_z_loss_clip": 3.81445312, + "router_z_loss_mlp": 3.18554688, + "step": 1718, + "time_per_iteration": 3.0933029651641846 + }, + { + "auxiliary_loss_clip": 0.01577855, + "auxiliary_loss_mlp": 0.01581604, + "balance_loss_clip": 1.19556975, + "balance_loss_mlp": 1.25849879, + "epoch": 0.20669752900859736, + "flos": 21801475579680.0, + "grad_norm": 2.0364757903824646, + "language_loss": 0.68750989, + "learning_rate": 3.681472580183512e-06, + "loss": 0.71910453, + "num_input_tokens_seen": 36518810, + "router_z_loss_clip": 3.8203125, + "router_z_loss_mlp": 3.23046875, + "step": 1719, + "time_per_iteration": 3.8012588024139404 + }, + { + "auxiliary_loss_clip": 0.01569674, + "auxiliary_loss_mlp": 0.01619785, + "balance_loss_clip": 1.18785453, + "balance_loss_mlp": 1.30736125, + "epoch": 0.20681777189923645, + "flos": 15123814987680.0, + "grad_norm": 1.894559105639221, + "language_loss": 0.86307025, + "learning_rate": 3.6810506824936455e-06, + "loss": 0.89496487, + "num_input_tokens_seen": 36536890, + "router_z_loss_clip": 3.81835938, + "router_z_loss_mlp": 3.12109375, + "step": 1720, + "time_per_iteration": 3.0589287281036377 + }, + { + "auxiliary_loss_clip": 0.01698326, + "auxiliary_loss_mlp": 0.01447205, + "balance_loss_clip": 1.30859077, + "balance_loss_mlp": 1.16186523, + "epoch": 0.20693801478987556, + "flos": 56486289584160.0, + "grad_norm": 1.1062004830286931, + "language_loss": 0.62526226, + "learning_rate": 3.680628529789726e-06, + "loss": 0.65671754, + "num_input_tokens_seen": 36589300, + "router_z_loss_clip": 3.890625, + "router_z_loss_mlp": 2.859375, + "step": 1721, + "time_per_iteration": 4.193534851074219 + }, + { + "auxiliary_loss_clip": 0.01568301, + "auxiliary_loss_mlp": 0.01525105, + "balance_loss_clip": 1.18681335, + "balance_loss_mlp": 1.18845809, + "epoch": 0.20705825768051464, + "flos": 21616232624640.0, + "grad_norm": 2.1453511178890827, + "language_loss": 0.86448681, + "learning_rate": 3.680206122135796e-06, + "loss": 0.89542091, + "num_input_tokens_seen": 36609905, + "router_z_loss_clip": 3.81445312, + "router_z_loss_mlp": 3.36914062, + "step": 1722, + "time_per_iteration": 3.0895886421203613 + }, + { + "auxiliary_loss_clip": 0.01568567, + "auxiliary_loss_mlp": 0.01494957, + "balance_loss_clip": 1.18574142, + "balance_loss_mlp": 1.1565932, + "epoch": 0.20717850057115372, + "flos": 25851295303200.0, + "grad_norm": 1.8380311033501375, + "language_loss": 0.78553474, + "learning_rate": 3.6797834595959323e-06, + "loss": 0.81616992, + "num_input_tokens_seen": 36629805, + "router_z_loss_clip": 3.83007812, + "router_z_loss_mlp": 3.38671875, + "step": 1723, + "time_per_iteration": 3.0524330139160156 + }, + { + "auxiliary_loss_clip": 0.01563885, + "auxiliary_loss_mlp": 0.01495191, + "balance_loss_clip": 1.18082595, + "balance_loss_mlp": 1.14977002, + "epoch": 0.20729874346179283, + "flos": 29133289704960.0, + "grad_norm": 3.058218635376258, + "language_loss": 0.78006715, + "learning_rate": 3.679360542234254e-06, + "loss": 0.81065786, + "num_input_tokens_seen": 36649150, + "router_z_loss_clip": 3.82617188, + "router_z_loss_mlp": 3.45507812, + "step": 1724, + "time_per_iteration": 3.993759870529175 + }, + { + "auxiliary_loss_clip": 0.0157161, + "auxiliary_loss_mlp": 0.01484346, + "balance_loss_clip": 1.1895926, + "balance_loss_mlp": 1.13434768, + "epoch": 0.20741898635243192, + "flos": 29025951919200.0, + "grad_norm": 1.7261930763646995, + "language_loss": 0.72442609, + "learning_rate": 3.678937370114916e-06, + "loss": 0.75498569, + "num_input_tokens_seen": 36668955, + "router_z_loss_clip": 3.81640625, + "router_z_loss_mlp": 3.50390625, + "step": 1725, + "time_per_iteration": 3.314741373062134 + }, + { + "auxiliary_loss_clip": 0.0156446, + "auxiliary_loss_mlp": 0.01460081, + "balance_loss_clip": 1.18162322, + "balance_loss_mlp": 1.10931933, + "epoch": 0.207539229243071, + "flos": 15561244828800.0, + "grad_norm": 2.024848170446247, + "language_loss": 0.7895171, + "learning_rate": 3.678513943302114e-06, + "loss": 0.81976253, + "num_input_tokens_seen": 36685730, + "router_z_loss_clip": 3.83007812, + "router_z_loss_mlp": 3.50976562, + "step": 1726, + "time_per_iteration": 3.046771287918091 + }, + { + "auxiliary_loss_clip": 0.01571003, + "auxiliary_loss_mlp": 0.01455835, + "balance_loss_clip": 1.18833947, + "balance_loss_mlp": 1.09134102, + "epoch": 0.20765947213371008, + "flos": 20523132123840.0, + "grad_norm": 3.18971925592184, + "language_loss": 0.84966612, + "learning_rate": 3.678090261860082e-06, + "loss": 0.87993449, + "num_input_tokens_seen": 36705460, + "router_z_loss_clip": 3.83203125, + "router_z_loss_mlp": 3.64257812, + "step": 1727, + "time_per_iteration": 3.143146514892578 + }, + { + "auxiliary_loss_clip": 0.0155524, + "auxiliary_loss_mlp": 0.01455518, + "balance_loss_clip": 1.17315221, + "balance_loss_mlp": 1.08663714, + "epoch": 0.2077797150243492, + "flos": 19356450264000.0, + "grad_norm": 2.0487877105192216, + "language_loss": 0.7807979, + "learning_rate": 3.6776663258530906e-06, + "loss": 0.81090546, + "num_input_tokens_seen": 36724110, + "router_z_loss_clip": 3.82226562, + "router_z_loss_mlp": 3.6875, + "step": 1728, + "time_per_iteration": 3.846889019012451 + }, + { + "auxiliary_loss_clip": 0.01565304, + "auxiliary_loss_mlp": 0.01453717, + "balance_loss_clip": 1.18321633, + "balance_loss_mlp": 1.08140302, + "epoch": 0.20789995791498828, + "flos": 21831818472000.0, + "grad_norm": 2.3376108707893235, + "language_loss": 0.71371615, + "learning_rate": 3.6772421353454516e-06, + "loss": 0.74390632, + "num_input_tokens_seen": 36742705, + "router_z_loss_clip": 3.8203125, + "router_z_loss_mlp": 3.71875, + "step": 1729, + "time_per_iteration": 3.107286214828491 + }, + { + "auxiliary_loss_clip": 0.01562103, + "auxiliary_loss_mlp": 0.01461334, + "balance_loss_clip": 1.18051314, + "balance_loss_mlp": 1.08577728, + "epoch": 0.20802020080562736, + "flos": 23151086776800.0, + "grad_norm": 2.2465201370643912, + "language_loss": 0.88462335, + "learning_rate": 3.6768176904015153e-06, + "loss": 0.91485775, + "num_input_tokens_seen": 36762510, + "router_z_loss_clip": 3.81445312, + "router_z_loss_mlp": 3.75390625, + "step": 1730, + "time_per_iteration": 3.0669093132019043 + }, + { + "auxiliary_loss_clip": 0.01559217, + "auxiliary_loss_mlp": 0.01446971, + "balance_loss_clip": 1.17733753, + "balance_loss_mlp": 1.07580066, + "epoch": 0.20814044369626647, + "flos": 23074736662080.0, + "grad_norm": 2.306714772237506, + "language_loss": 0.60369146, + "learning_rate": 3.6763929910856674e-06, + "loss": 0.6337533, + "num_input_tokens_seen": 36780960, + "router_z_loss_clip": 3.81835938, + "router_z_loss_mlp": 3.7109375, + "step": 1731, + "time_per_iteration": 3.1063101291656494 + }, + { + "auxiliary_loss_clip": 0.01570403, + "auxiliary_loss_mlp": 0.01460931, + "balance_loss_clip": 1.18803287, + "balance_loss_mlp": 1.08384824, + "epoch": 0.20826068658690555, + "flos": 19609964635680.0, + "grad_norm": 3.1265461076683283, + "language_loss": 0.78237247, + "learning_rate": 3.6759680374623365e-06, + "loss": 0.81268585, + "num_input_tokens_seen": 36798875, + "router_z_loss_clip": 3.82617188, + "router_z_loss_mlp": 3.76953125, + "step": 1732, + "time_per_iteration": 2.952242374420166 + }, + { + "auxiliary_loss_clip": 0.01570182, + "auxiliary_loss_mlp": 0.01448758, + "balance_loss_clip": 1.18540597, + "balance_loss_mlp": 1.07358241, + "epoch": 0.20838092947754464, + "flos": 25376392075680.0, + "grad_norm": 2.314120845580532, + "language_loss": 0.75677025, + "learning_rate": 3.675542829595986e-06, + "loss": 0.78695965, + "num_input_tokens_seen": 36818540, + "router_z_loss_clip": 3.84765625, + "router_z_loss_mlp": 3.74804688, + "step": 1733, + "time_per_iteration": 3.0781710147857666 + }, + { + "auxiliary_loss_clip": 0.01558914, + "auxiliary_loss_mlp": 0.01447885, + "balance_loss_clip": 1.17501748, + "balance_loss_mlp": 1.07023048, + "epoch": 0.20850117236818372, + "flos": 24064064624160.0, + "grad_norm": 1.812780287719859, + "language_loss": 0.79559553, + "learning_rate": 3.6751173675511213e-06, + "loss": 0.82566357, + "num_input_tokens_seen": 36840585, + "router_z_loss_clip": 3.84179688, + "router_z_loss_mlp": 3.77148438, + "step": 1734, + "time_per_iteration": 3.022817373275757 + }, + { + "auxiliary_loss_clip": 0.01561261, + "auxiliary_loss_mlp": 0.01442273, + "balance_loss_clip": 1.17556715, + "balance_loss_mlp": 1.06404614, + "epoch": 0.20862141525882283, + "flos": 20079519992640.0, + "grad_norm": 2.417151064208919, + "language_loss": 0.87640458, + "learning_rate": 3.674691651392283e-06, + "loss": 0.9064399, + "num_input_tokens_seen": 36858255, + "router_z_loss_clip": 3.85742188, + "router_z_loss_mlp": 3.77929688, + "step": 1735, + "time_per_iteration": 2.985265016555786 + }, + { + "auxiliary_loss_clip": 0.01560571, + "auxiliary_loss_mlp": 0.01442765, + "balance_loss_clip": 1.177279, + "balance_loss_mlp": 1.05500078, + "epoch": 0.2087416581494619, + "flos": 39018035854080.0, + "grad_norm": 2.2050530608242127, + "language_loss": 0.76035291, + "learning_rate": 3.674265681184053e-06, + "loss": 0.79038626, + "num_input_tokens_seen": 36881515, + "router_z_loss_clip": 3.83984375, + "router_z_loss_mlp": 3.875, + "step": 1736, + "time_per_iteration": 3.169422149658203 + }, + { + "auxiliary_loss_clip": 0.01562236, + "auxiliary_loss_mlp": 0.01448566, + "balance_loss_clip": 1.17760897, + "balance_loss_mlp": 1.06690574, + "epoch": 0.208861901040101, + "flos": 26104278680640.0, + "grad_norm": 1.9200071080288936, + "language_loss": 0.86592329, + "learning_rate": 3.6738394569910504e-06, + "loss": 0.89603126, + "num_input_tokens_seen": 36902055, + "router_z_loss_clip": 3.84375, + "router_z_loss_mlp": 3.81054688, + "step": 1737, + "time_per_iteration": 2.993521213531494 + }, + { + "auxiliary_loss_clip": 0.01571676, + "auxiliary_loss_mlp": 0.0143273, + "balance_loss_clip": 1.18715453, + "balance_loss_mlp": 1.06613779, + "epoch": 0.2089821439307401, + "flos": 28401041361600.0, + "grad_norm": 2.4514025836635844, + "language_loss": 0.82750589, + "learning_rate": 3.6734129788779333e-06, + "loss": 0.85754991, + "num_input_tokens_seen": 36921230, + "router_z_loss_clip": 3.84375, + "router_z_loss_mlp": 3.6640625, + "step": 1738, + "time_per_iteration": 3.1636338233947754 + }, + { + "auxiliary_loss_clip": 0.01567016, + "auxiliary_loss_mlp": 0.01430648, + "balance_loss_clip": 1.18169618, + "balance_loss_mlp": 1.05871511, + "epoch": 0.2091023868213792, + "flos": 21071768423040.0, + "grad_norm": 4.89195205496731, + "language_loss": 0.90414894, + "learning_rate": 3.6729862469093976e-06, + "loss": 0.9341256, + "num_input_tokens_seen": 36940325, + "router_z_loss_clip": 3.85351562, + "router_z_loss_mlp": 3.71679688, + "step": 1739, + "time_per_iteration": 3.0899972915649414 + }, + { + "auxiliary_loss_clip": 0.0157025, + "auxiliary_loss_mlp": 0.01421736, + "balance_loss_clip": 1.18406844, + "balance_loss_mlp": 1.05171013, + "epoch": 0.20922262971201827, + "flos": 22457677233600.0, + "grad_norm": 2.716455879652805, + "language_loss": 0.83365017, + "learning_rate": 3.6725592611501782e-06, + "loss": 0.86357003, + "num_input_tokens_seen": 36959000, + "router_z_loss_clip": 3.86132812, + "router_z_loss_mlp": 3.69726562, + "step": 1740, + "time_per_iteration": 2.9886860847473145 + }, + { + "auxiliary_loss_clip": 0.01564464, + "auxiliary_loss_mlp": 0.01428744, + "balance_loss_clip": 1.17898154, + "balance_loss_mlp": 1.06920934, + "epoch": 0.20934287260265738, + "flos": 27854566967520.0, + "grad_norm": 2.354575571731932, + "language_loss": 0.76626933, + "learning_rate": 3.6721320216650496e-06, + "loss": 0.79620141, + "num_input_tokens_seen": 36979615, + "router_z_loss_clip": 3.85546875, + "router_z_loss_mlp": 3.59570312, + "step": 1741, + "time_per_iteration": 3.09436297416687 + }, + { + "auxiliary_loss_clip": 0.01567199, + "auxiliary_loss_mlp": 0.01419109, + "balance_loss_clip": 1.18252599, + "balance_loss_mlp": 1.05652201, + "epoch": 0.20946311549329646, + "flos": 16437166499520.0, + "grad_norm": 1.8248401902046512, + "language_loss": 0.83724988, + "learning_rate": 3.6717045285188215e-06, + "loss": 0.86711299, + "num_input_tokens_seen": 36997310, + "router_z_loss_clip": 3.84960938, + "router_z_loss_mlp": 3.625, + "step": 1742, + "time_per_iteration": 3.012733221054077 + }, + { + "auxiliary_loss_clip": 0.01567233, + "auxiliary_loss_mlp": 0.01417837, + "balance_loss_clip": 1.18080056, + "balance_loss_mlp": 1.05715799, + "epoch": 0.20958335838393555, + "flos": 22494960979200.0, + "grad_norm": 1.88268613278099, + "language_loss": 0.86725664, + "learning_rate": 3.671276781776346e-06, + "loss": 0.89710736, + "num_input_tokens_seen": 37015965, + "router_z_loss_clip": 3.8671875, + "router_z_loss_mlp": 3.60742188, + "step": 1743, + "time_per_iteration": 3.106337547302246 + }, + { + "auxiliary_loss_clip": 0.01568566, + "auxiliary_loss_mlp": 0.01421225, + "balance_loss_clip": 1.18260646, + "balance_loss_mlp": 1.05348825, + "epoch": 0.20970360127457463, + "flos": 25226915739840.0, + "grad_norm": 2.057229130568692, + "language_loss": 0.67626208, + "learning_rate": 3.6708487815025128e-06, + "loss": 0.70615995, + "num_input_tokens_seen": 37036545, + "router_z_loss_clip": 3.86328125, + "router_z_loss_mlp": 3.671875, + "step": 1744, + "time_per_iteration": 3.168560028076172 + }, + { + "auxiliary_loss_clip": 0.01565448, + "auxiliary_loss_mlp": 0.01417107, + "balance_loss_clip": 1.17987418, + "balance_loss_mlp": 1.05471063, + "epoch": 0.20982384416521374, + "flos": 18481173372000.0, + "grad_norm": 2.348544603924231, + "language_loss": 0.74669069, + "learning_rate": 3.6704205277622463e-06, + "loss": 0.7765162, + "num_input_tokens_seen": 37054985, + "router_z_loss_clip": 3.85742188, + "router_z_loss_mlp": 3.62304688, + "step": 1745, + "time_per_iteration": 3.0387940406799316 + }, + { + "auxiliary_loss_clip": 0.01566932, + "auxiliary_loss_mlp": 0.0142743, + "balance_loss_clip": 1.18176389, + "balance_loss_mlp": 1.07590532, + "epoch": 0.20994408705585282, + "flos": 25375861081440.0, + "grad_norm": 1.9504815595170077, + "language_loss": 0.80623901, + "learning_rate": 3.6699920206205146e-06, + "loss": 0.83618265, + "num_input_tokens_seen": 37075725, + "router_z_loss_clip": 3.85546875, + "router_z_loss_mlp": 3.51757812, + "step": 1746, + "time_per_iteration": 3.063256025314331 + }, + { + "auxiliary_loss_clip": 0.01567796, + "auxiliary_loss_mlp": 0.01418789, + "balance_loss_clip": 1.18307602, + "balance_loss_mlp": 1.05696487, + "epoch": 0.2100643299464919, + "flos": 21322893320640.0, + "grad_norm": 1.8302064780183847, + "language_loss": 0.8244679, + "learning_rate": 3.669563260142321e-06, + "loss": 0.8543337, + "num_input_tokens_seen": 37094615, + "router_z_loss_clip": 3.84765625, + "router_z_loss_mlp": 3.61523438, + "step": 1747, + "time_per_iteration": 3.912339448928833 + }, + { + "auxiliary_loss_clip": 0.01568431, + "auxiliary_loss_mlp": 0.01423283, + "balance_loss_clip": 1.18358827, + "balance_loss_mlp": 1.06393862, + "epoch": 0.21018457283713102, + "flos": 19356412335840.0, + "grad_norm": 20.026782880931634, + "language_loss": 0.84338969, + "learning_rate": 3.6691342463927083e-06, + "loss": 0.87330681, + "num_input_tokens_seen": 37113610, + "router_z_loss_clip": 3.84960938, + "router_z_loss_mlp": 3.59179688, + "step": 1748, + "time_per_iteration": 4.020112991333008 + }, + { + "auxiliary_loss_clip": 0.01572079, + "auxiliary_loss_mlp": 0.01439203, + "balance_loss_clip": 1.18574584, + "balance_loss_mlp": 1.08958578, + "epoch": 0.2103048157277701, + "flos": 28332580304160.0, + "grad_norm": 1.701818914912486, + "language_loss": 0.82109368, + "learning_rate": 3.668704979436758e-06, + "loss": 0.85120648, + "num_input_tokens_seen": 37133705, + "router_z_loss_clip": 3.86523438, + "router_z_loss_mlp": 3.49804688, + "step": 1749, + "time_per_iteration": 3.194803237915039 + }, + { + "auxiliary_loss_clip": 0.0157128, + "auxiliary_loss_mlp": 0.01424937, + "balance_loss_clip": 1.18474829, + "balance_loss_mlp": 1.07436657, + "epoch": 0.21042505861840918, + "flos": 17459226828000.0, + "grad_norm": 2.056204468567454, + "language_loss": 0.78636825, + "learning_rate": 3.668275459339588e-06, + "loss": 0.81633043, + "num_input_tokens_seen": 37152185, + "router_z_loss_clip": 3.8671875, + "router_z_loss_mlp": 3.5078125, + "step": 1750, + "time_per_iteration": 3.160464286804199 + }, + { + "auxiliary_loss_clip": 0.01572432, + "auxiliary_loss_mlp": 0.01445122, + "balance_loss_clip": 1.1861763, + "balance_loss_mlp": 1.10237169, + "epoch": 0.21054530150904827, + "flos": 14211557775360.0, + "grad_norm": 1.7347279837449987, + "language_loss": 0.80159396, + "learning_rate": 3.667845686166358e-06, + "loss": 0.83176947, + "num_input_tokens_seen": 37169110, + "router_z_loss_clip": 3.86328125, + "router_z_loss_mlp": 3.42773438, + "step": 1751, + "time_per_iteration": 3.0212411880493164 + }, + { + "auxiliary_loss_clip": 0.01574379, + "auxiliary_loss_mlp": 0.01425124, + "balance_loss_clip": 1.18676686, + "balance_loss_mlp": 1.0787499, + "epoch": 0.21066554439968738, + "flos": 18619840182240.0, + "grad_norm": 1.8305518905280767, + "language_loss": 0.86121368, + "learning_rate": 3.6674156599822634e-06, + "loss": 0.89120877, + "num_input_tokens_seen": 37184905, + "router_z_loss_clip": 3.87695312, + "router_z_loss_mlp": 3.46289062, + "step": 1752, + "time_per_iteration": 3.87943696975708 + }, + { + "auxiliary_loss_clip": 0.01574582, + "auxiliary_loss_mlp": 0.01423761, + "balance_loss_clip": 1.18841112, + "balance_loss_mlp": 1.07338107, + "epoch": 0.21078578729032646, + "flos": 23661111844800.0, + "grad_norm": 2.204242609856429, + "language_loss": 0.81737995, + "learning_rate": 3.666985380852539e-06, + "loss": 0.84736335, + "num_input_tokens_seen": 37203910, + "router_z_loss_clip": 3.86328125, + "router_z_loss_mlp": 3.50390625, + "step": 1753, + "time_per_iteration": 3.1526834964752197 + }, + { + "auxiliary_loss_clip": 0.01565542, + "auxiliary_loss_mlp": 0.01449222, + "balance_loss_clip": 1.18009877, + "balance_loss_mlp": 1.10227549, + "epoch": 0.21090603018096554, + "flos": 29348685911520.0, + "grad_norm": 2.660835297693647, + "language_loss": 0.74759316, + "learning_rate": 3.6665548488424576e-06, + "loss": 0.77774084, + "num_input_tokens_seen": 37222670, + "router_z_loss_clip": 3.85351562, + "router_z_loss_mlp": 3.47070312, + "step": 1754, + "time_per_iteration": 3.023026704788208 + }, + { + "auxiliary_loss_clip": 0.01575129, + "auxiliary_loss_mlp": 0.01435813, + "balance_loss_clip": 1.18945312, + "balance_loss_mlp": 1.09363437, + "epoch": 0.21102627307160465, + "flos": 23263658648640.0, + "grad_norm": 1.9610302047306873, + "language_loss": 0.88242263, + "learning_rate": 3.6661240640173307e-06, + "loss": 0.91253209, + "num_input_tokens_seen": 37244140, + "router_z_loss_clip": 3.85546875, + "router_z_loss_mlp": 3.42578125, + "step": 1755, + "time_per_iteration": 3.0339157581329346 + }, + { + "auxiliary_loss_clip": 0.01707237, + "auxiliary_loss_mlp": 0.01318092, + "balance_loss_clip": 1.31344056, + "balance_loss_mlp": 1.00528717, + "epoch": 0.21114651596224374, + "flos": 54639359252640.0, + "grad_norm": 1.0059330102460247, + "language_loss": 0.57889986, + "learning_rate": 3.6656930264425085e-06, + "loss": 0.60915315, + "num_input_tokens_seen": 37308185, + "router_z_loss_clip": 3.9375, + "router_z_loss_mlp": 3.125, + "step": 1756, + "time_per_iteration": 4.226025819778442 + }, + { + "auxiliary_loss_clip": 0.01574112, + "auxiliary_loss_mlp": 0.01433939, + "balance_loss_clip": 1.18850923, + "balance_loss_mlp": 1.08699191, + "epoch": 0.21126675885288282, + "flos": 21545457949440.0, + "grad_norm": 1.9149972746407495, + "language_loss": 0.7568081, + "learning_rate": 3.665261736183378e-06, + "loss": 0.7868886, + "num_input_tokens_seen": 37328220, + "router_z_loss_clip": 3.85546875, + "router_z_loss_mlp": 3.47070312, + "step": 1757, + "time_per_iteration": 2.952845811843872 + }, + { + "auxiliary_loss_clip": 0.01574674, + "auxiliary_loss_mlp": 0.01437486, + "balance_loss_clip": 1.18982434, + "balance_loss_mlp": 1.08710575, + "epoch": 0.2113870017435219, + "flos": 10963395656640.0, + "grad_norm": 3.3919201644603554, + "language_loss": 0.89217085, + "learning_rate": 3.664830193305366e-06, + "loss": 0.92229247, + "num_input_tokens_seen": 37345995, + "router_z_loss_clip": 3.84570312, + "router_z_loss_mlp": 3.50585938, + "step": 1758, + "time_per_iteration": 3.0191807746887207 + }, + { + "auxiliary_loss_clip": 0.01573591, + "auxiliary_loss_mlp": 0.01431006, + "balance_loss_clip": 1.18866646, + "balance_loss_mlp": 1.0794816, + "epoch": 0.211507244634161, + "flos": 16655672815200.0, + "grad_norm": 3.317781410315663, + "language_loss": 0.77028346, + "learning_rate": 3.6643983978739373e-06, + "loss": 0.80032945, + "num_input_tokens_seen": 37362610, + "router_z_loss_clip": 3.84960938, + "router_z_loss_mlp": 3.51953125, + "step": 1759, + "time_per_iteration": 2.9889211654663086 + }, + { + "auxiliary_loss_clip": 0.01588384, + "auxiliary_loss_mlp": 0.0144633, + "balance_loss_clip": 1.2028203, + "balance_loss_mlp": 1.10129023, + "epoch": 0.2116274875248001, + "flos": 20955896801280.0, + "grad_norm": 1.8726161549933549, + "language_loss": 0.82524949, + "learning_rate": 3.663966349954596e-06, + "loss": 0.85559666, + "num_input_tokens_seen": 37382790, + "router_z_loss_clip": 3.85742188, + "router_z_loss_mlp": 3.453125, + "step": 1760, + "time_per_iteration": 3.006120204925537 + }, + { + "auxiliary_loss_clip": 0.01707331, + "auxiliary_loss_mlp": 0.01324768, + "balance_loss_clip": 1.31635332, + "balance_loss_mlp": 1.02111816, + "epoch": 0.21174773041543918, + "flos": 68203097861760.0, + "grad_norm": 0.8053540176333623, + "language_loss": 0.59719056, + "learning_rate": 3.6635340496128816e-06, + "loss": 0.62751156, + "num_input_tokens_seen": 37439720, + "router_z_loss_clip": 3.90625, + "router_z_loss_mlp": 3.03125, + "step": 1761, + "time_per_iteration": 3.5148212909698486 + }, + { + "auxiliary_loss_clip": 0.01581791, + "auxiliary_loss_mlp": 0.01425745, + "balance_loss_clip": 1.1969018, + "balance_loss_mlp": 1.06620979, + "epoch": 0.2118679733060783, + "flos": 20670446554560.0, + "grad_norm": 2.435459745341804, + "language_loss": 0.92805481, + "learning_rate": 3.6631014969143747e-06, + "loss": 0.95813012, + "num_input_tokens_seen": 37459410, + "router_z_loss_clip": 3.8515625, + "router_z_loss_mlp": 3.59375, + "step": 1762, + "time_per_iteration": 3.184717893600464 + }, + { + "auxiliary_loss_clip": 0.01585561, + "auxiliary_loss_mlp": 0.01418969, + "balance_loss_clip": 1.20263588, + "balance_loss_mlp": 1.06706357, + "epoch": 0.21198821619671737, + "flos": 23225768052480.0, + "grad_norm": 1.896507059291853, + "language_loss": 0.8889553, + "learning_rate": 3.662668691924693e-06, + "loss": 0.91900063, + "num_input_tokens_seen": 37480460, + "router_z_loss_clip": 3.83007812, + "router_z_loss_mlp": 3.5234375, + "step": 1763, + "time_per_iteration": 3.1184847354888916 + }, + { + "auxiliary_loss_clip": 0.01585235, + "auxiliary_loss_mlp": 0.0142696, + "balance_loss_clip": 1.20187569, + "balance_loss_mlp": 1.07238376, + "epoch": 0.21210845908735645, + "flos": 24500849686560.0, + "grad_norm": 2.547048901311058, + "language_loss": 0.71767557, + "learning_rate": 3.6622356347094927e-06, + "loss": 0.74779749, + "num_input_tokens_seen": 37502025, + "router_z_loss_clip": 3.8359375, + "router_z_loss_mlp": 3.54492188, + "step": 1764, + "time_per_iteration": 3.078874349594116 + }, + { + "auxiliary_loss_clip": 0.01582259, + "auxiliary_loss_mlp": 0.0143921, + "balance_loss_clip": 1.19888234, + "balance_loss_mlp": 1.08482504, + "epoch": 0.21222870197799554, + "flos": 27092279157120.0, + "grad_norm": 1.98961186331231, + "language_loss": 0.7906428, + "learning_rate": 3.6618023253344684e-06, + "loss": 0.82085752, + "num_input_tokens_seen": 37520885, + "router_z_loss_clip": 3.83398438, + "router_z_loss_mlp": 3.54296875, + "step": 1765, + "time_per_iteration": 3.055663824081421 + }, + { + "auxiliary_loss_clip": 0.01578722, + "auxiliary_loss_mlp": 0.0145113, + "balance_loss_clip": 1.19520724, + "balance_loss_mlp": 1.10208476, + "epoch": 0.21234894486863465, + "flos": 16875772113600.0, + "grad_norm": 1.5265063316306242, + "language_loss": 0.83623695, + "learning_rate": 3.6613687638653527e-06, + "loss": 0.86653543, + "num_input_tokens_seen": 37539055, + "router_z_loss_clip": 3.83789062, + "router_z_loss_mlp": 3.49414062, + "step": 1766, + "time_per_iteration": 3.0696310997009277 + }, + { + "auxiliary_loss_clip": 0.01577019, + "auxiliary_loss_mlp": 0.01432327, + "balance_loss_clip": 1.19502687, + "balance_loss_mlp": 1.07946801, + "epoch": 0.21246918775927373, + "flos": 23477082590880.0, + "grad_norm": 1.7878745099124231, + "language_loss": 0.78226811, + "learning_rate": 3.660934950367916e-06, + "loss": 0.8123616, + "num_input_tokens_seen": 37558300, + "router_z_loss_clip": 3.82226562, + "router_z_loss_mlp": 3.53125, + "step": 1767, + "time_per_iteration": 3.1390066146850586 + }, + { + "auxiliary_loss_clip": 0.01575423, + "auxiliary_loss_mlp": 0.01430729, + "balance_loss_clip": 1.19193661, + "balance_loss_mlp": 1.07939553, + "epoch": 0.21258943064991281, + "flos": 22384475156160.0, + "grad_norm": 2.123013745301515, + "language_loss": 0.83550775, + "learning_rate": 3.660500884907968e-06, + "loss": 0.86556929, + "num_input_tokens_seen": 37579040, + "router_z_loss_clip": 3.8359375, + "router_z_loss_mlp": 3.51367188, + "step": 1768, + "time_per_iteration": 3.0359015464782715 + }, + { + "auxiliary_loss_clip": 0.01693979, + "auxiliary_loss_mlp": 0.0130246, + "balance_loss_clip": 1.30598187, + "balance_loss_mlp": 1.01635742, + "epoch": 0.21270967354055192, + "flos": 59445815490720.0, + "grad_norm": 0.8265767061060713, + "language_loss": 0.59928191, + "learning_rate": 3.660066567551356e-06, + "loss": 0.62924635, + "num_input_tokens_seen": 37639185, + "router_z_loss_clip": 3.875, + "router_z_loss_mlp": 2.8671875, + "step": 1769, + "time_per_iteration": 3.4322235584259033 + }, + { + "auxiliary_loss_clip": 0.01569169, + "auxiliary_loss_mlp": 0.01447841, + "balance_loss_clip": 1.18532944, + "balance_loss_mlp": 1.09917736, + "epoch": 0.212829916431191, + "flos": 21546216512640.0, + "grad_norm": 2.9640507845435047, + "language_loss": 0.84649765, + "learning_rate": 3.6596319983639657e-06, + "loss": 0.87666774, + "num_input_tokens_seen": 37657765, + "router_z_loss_clip": 3.83984375, + "router_z_loss_mlp": 3.49023438, + "step": 1770, + "time_per_iteration": 3.069009780883789 + }, + { + "auxiliary_loss_clip": 0.01572946, + "auxiliary_loss_mlp": 0.01432647, + "balance_loss_clip": 1.19183326, + "balance_loss_mlp": 1.076545, + "epoch": 0.2129501593218301, + "flos": 28990374940800.0, + "grad_norm": 1.836177419320035, + "language_loss": 0.86814141, + "learning_rate": 3.6591971774117214e-06, + "loss": 0.89819735, + "num_input_tokens_seen": 37680740, + "router_z_loss_clip": 3.81640625, + "router_z_loss_mlp": 3.5625, + "step": 1771, + "time_per_iteration": 3.021205425262451 + }, + { + "auxiliary_loss_clip": 0.0157596, + "auxiliary_loss_mlp": 0.01434423, + "balance_loss_clip": 1.19342029, + "balance_loss_mlp": 1.0760324, + "epoch": 0.2130704022124692, + "flos": 18808989737760.0, + "grad_norm": 2.0594479731030133, + "language_loss": 0.8020041, + "learning_rate": 3.6587621047605833e-06, + "loss": 0.83210796, + "num_input_tokens_seen": 37697910, + "router_z_loss_clip": 3.82421875, + "router_z_loss_mlp": 3.5859375, + "step": 1772, + "time_per_iteration": 3.0527970790863037 + }, + { + "auxiliary_loss_clip": 0.01573823, + "auxiliary_loss_mlp": 0.01447602, + "balance_loss_clip": 1.18917465, + "balance_loss_mlp": 1.09989262, + "epoch": 0.21319064510310828, + "flos": 13919887310400.0, + "grad_norm": 2.5085886949003227, + "language_loss": 0.87301409, + "learning_rate": 3.6583267804765542e-06, + "loss": 0.9032284, + "num_input_tokens_seen": 37712245, + "router_z_loss_clip": 3.84570312, + "router_z_loss_mlp": 3.47851562, + "step": 1773, + "time_per_iteration": 3.086066961288452 + }, + { + "auxiliary_loss_clip": 0.01577393, + "auxiliary_loss_mlp": 0.01448627, + "balance_loss_clip": 1.19287467, + "balance_loss_mlp": 1.09462357, + "epoch": 0.21331088799374737, + "flos": 20961813594240.0, + "grad_norm": 2.2724387420067584, + "language_loss": 0.85909963, + "learning_rate": 3.6578912046256702e-06, + "loss": 0.88935983, + "num_input_tokens_seen": 37730765, + "router_z_loss_clip": 3.84765625, + "router_z_loss_mlp": 3.54101562, + "step": 1774, + "time_per_iteration": 3.8643620014190674 + }, + { + "auxiliary_loss_clip": 0.01571937, + "auxiliary_loss_mlp": 0.01434441, + "balance_loss_clip": 1.18686461, + "balance_loss_mlp": 1.07566857, + "epoch": 0.21343113088438645, + "flos": 18626629322880.0, + "grad_norm": 2.0184197384377125, + "language_loss": 0.7663157, + "learning_rate": 3.6574553772740083e-06, + "loss": 0.79637945, + "num_input_tokens_seen": 37748695, + "router_z_loss_clip": 3.85351562, + "router_z_loss_mlp": 3.58789062, + "step": 1775, + "time_per_iteration": 3.8592023849487305 + }, + { + "auxiliary_loss_clip": 0.01679746, + "auxiliary_loss_mlp": 0.01299972, + "balance_loss_clip": 1.2888813, + "balance_loss_mlp": 1.01234436, + "epoch": 0.21355137377502556, + "flos": 67420290552480.0, + "grad_norm": 0.8601364785277359, + "language_loss": 0.6181466, + "learning_rate": 3.657019298487684e-06, + "loss": 0.64794385, + "num_input_tokens_seen": 37813705, + "router_z_loss_clip": 3.90625, + "router_z_loss_mlp": 2.8828125, + "step": 1776, + "time_per_iteration": 3.483375310897827 + }, + { + "auxiliary_loss_clip": 0.01570253, + "auxiliary_loss_mlp": 0.01420847, + "balance_loss_clip": 1.18656564, + "balance_loss_mlp": 1.05616236, + "epoch": 0.21367161666566464, + "flos": 34534958387040.0, + "grad_norm": 1.679206463501675, + "language_loss": 0.84494519, + "learning_rate": 3.6565829683328495e-06, + "loss": 0.87485617, + "num_input_tokens_seen": 37836330, + "router_z_loss_clip": 3.83984375, + "router_z_loss_mlp": 3.64453125, + "step": 1777, + "time_per_iteration": 3.0528042316436768 + }, + { + "auxiliary_loss_clip": 0.01566425, + "auxiliary_loss_mlp": 0.01429267, + "balance_loss_clip": 1.18219972, + "balance_loss_mlp": 1.06973135, + "epoch": 0.21379185955630373, + "flos": 18991350152640.0, + "grad_norm": 1.9342220730585529, + "language_loss": 0.86009121, + "learning_rate": 3.6561463868756965e-06, + "loss": 0.89004815, + "num_input_tokens_seen": 37855030, + "router_z_loss_clip": 3.84375, + "router_z_loss_mlp": 3.59570312, + "step": 1778, + "time_per_iteration": 3.025029182434082 + }, + { + "auxiliary_loss_clip": 0.01571827, + "auxiliary_loss_mlp": 0.01428627, + "balance_loss_clip": 1.18811822, + "balance_loss_mlp": 1.06985497, + "epoch": 0.21391210244694284, + "flos": 28220918708160.0, + "grad_norm": 1.7051840514374983, + "language_loss": 0.77946419, + "learning_rate": 3.655709554182452e-06, + "loss": 0.80946875, + "num_input_tokens_seen": 37875370, + "router_z_loss_clip": 3.83789062, + "router_z_loss_mlp": 3.58789062, + "step": 1779, + "time_per_iteration": 3.0250356197357178 + }, + { + "auxiliary_loss_clip": 0.01570366, + "auxiliary_loss_mlp": 0.01461989, + "balance_loss_clip": 1.18533206, + "balance_loss_mlp": 1.10798526, + "epoch": 0.21403234533758192, + "flos": 17456875282080.0, + "grad_norm": 1.8672445963891182, + "language_loss": 0.84737444, + "learning_rate": 3.6552724703193855e-06, + "loss": 0.87769794, + "num_input_tokens_seen": 37892560, + "router_z_loss_clip": 3.8515625, + "router_z_loss_mlp": 3.54101562, + "step": 1780, + "time_per_iteration": 3.8915159702301025 + }, + { + "auxiliary_loss_clip": 0.01669828, + "auxiliary_loss_mlp": 0.01342651, + "balance_loss_clip": 1.28280616, + "balance_loss_mlp": 1.06646729, + "epoch": 0.214152588228221, + "flos": 51643118522880.0, + "grad_norm": 0.7909871027515328, + "language_loss": 0.55958164, + "learning_rate": 3.654835135352801e-06, + "loss": 0.58970642, + "num_input_tokens_seen": 37947370, + "router_z_loss_clip": 3.875, + "router_z_loss_mlp": 2.765625, + "step": 1781, + "time_per_iteration": 3.406522512435913 + }, + { + "auxiliary_loss_clip": 0.01563241, + "auxiliary_loss_mlp": 0.01429308, + "balance_loss_clip": 1.18055975, + "balance_loss_mlp": 1.07682991, + "epoch": 0.21427283111886009, + "flos": 19497961686240.0, + "grad_norm": 1.94013771649536, + "language_loss": 0.87524068, + "learning_rate": 3.654397549349043e-06, + "loss": 0.90516615, + "num_input_tokens_seen": 37964745, + "router_z_loss_clip": 3.82226562, + "router_z_loss_mlp": 3.52734375, + "step": 1782, + "time_per_iteration": 3.1099417209625244 + }, + { + "auxiliary_loss_clip": 0.01571167, + "auxiliary_loss_mlp": 0.01442317, + "balance_loss_clip": 1.18796325, + "balance_loss_mlp": 1.08907568, + "epoch": 0.2143930740094992, + "flos": 20086915983840.0, + "grad_norm": 2.044456405078045, + "language_loss": 0.75131476, + "learning_rate": 3.653959712374491e-06, + "loss": 0.78144962, + "num_input_tokens_seen": 37982850, + "router_z_loss_clip": 3.8359375, + "router_z_loss_mlp": 3.53320312, + "step": 1783, + "time_per_iteration": 3.7789034843444824 + }, + { + "auxiliary_loss_clip": 0.01563332, + "auxiliary_loss_mlp": 0.01411505, + "balance_loss_clip": 1.18159389, + "balance_loss_mlp": 1.0580734, + "epoch": 0.21451331690013828, + "flos": 21800754944640.0, + "grad_norm": 1.7138170250787435, + "language_loss": 0.8334502, + "learning_rate": 3.6535216244955663e-06, + "loss": 0.86319864, + "num_input_tokens_seen": 38002745, + "router_z_loss_clip": 3.8203125, + "router_z_loss_mlp": 3.5390625, + "step": 1784, + "time_per_iteration": 3.1888561248779297 + }, + { + "auxiliary_loss_clip": 0.0155712, + "auxiliary_loss_mlp": 0.01428414, + "balance_loss_clip": 1.17577088, + "balance_loss_mlp": 1.0782249, + "epoch": 0.21463355979077736, + "flos": 32856658476480.0, + "grad_norm": 2.6150127766677973, + "language_loss": 0.71318591, + "learning_rate": 3.653083285778726e-06, + "loss": 0.74304128, + "num_input_tokens_seen": 38024115, + "router_z_loss_clip": 3.81054688, + "router_z_loss_mlp": 3.50390625, + "step": 1785, + "time_per_iteration": 3.1773762702941895 + }, + { + "auxiliary_loss_clip": 0.01556052, + "auxiliary_loss_mlp": 0.01458524, + "balance_loss_clip": 1.176211, + "balance_loss_mlp": 1.11062312, + "epoch": 0.21475380268141647, + "flos": 21545988943680.0, + "grad_norm": 2.3091694273386922, + "language_loss": 0.81315935, + "learning_rate": 3.6526446962904653e-06, + "loss": 0.84330511, + "num_input_tokens_seen": 38042830, + "router_z_loss_clip": 3.8046875, + "router_z_loss_mlp": 3.48242188, + "step": 1786, + "time_per_iteration": 2.951392650604248 + }, + { + "auxiliary_loss_clip": 0.01559778, + "auxiliary_loss_mlp": 0.01439818, + "balance_loss_clip": 1.17631269, + "balance_loss_mlp": 1.0892477, + "epoch": 0.21487404557205556, + "flos": 32161731806880.0, + "grad_norm": 1.4606250230283484, + "language_loss": 0.74374759, + "learning_rate": 3.652205856097318e-06, + "loss": 0.77374351, + "num_input_tokens_seen": 38066015, + "router_z_loss_clip": 3.83398438, + "router_z_loss_mlp": 3.50976562, + "step": 1787, + "time_per_iteration": 3.286679267883301 + }, + { + "auxiliary_loss_clip": 0.01556443, + "auxiliary_loss_mlp": 0.01453848, + "balance_loss_clip": 1.17470527, + "balance_loss_mlp": 1.10098886, + "epoch": 0.21499428846269464, + "flos": 12674503789920.0, + "grad_norm": 5.248945046590298, + "language_loss": 0.79541773, + "learning_rate": 3.651766765265856e-06, + "loss": 0.82552063, + "num_input_tokens_seen": 38083025, + "router_z_loss_clip": 3.8203125, + "router_z_loss_mlp": 3.52929688, + "step": 1788, + "time_per_iteration": 3.092542886734009 + }, + { + "auxiliary_loss_clip": 0.01557313, + "auxiliary_loss_mlp": 0.01447847, + "balance_loss_clip": 1.17497802, + "balance_loss_mlp": 1.09460568, + "epoch": 0.21511453135333372, + "flos": 23473517343840.0, + "grad_norm": 2.294789784720231, + "language_loss": 0.81174326, + "learning_rate": 3.65132742386269e-06, + "loss": 0.84179485, + "num_input_tokens_seen": 38098245, + "router_z_loss_clip": 3.82421875, + "router_z_loss_mlp": 3.53320312, + "step": 1789, + "time_per_iteration": 3.01017689704895 + }, + { + "auxiliary_loss_clip": 0.015512, + "auxiliary_loss_mlp": 0.01448483, + "balance_loss_clip": 1.16861725, + "balance_loss_mlp": 1.10153627, + "epoch": 0.21523477424397283, + "flos": 26946026714880.0, + "grad_norm": 2.7761565099354897, + "language_loss": 0.84989977, + "learning_rate": 3.6508878319544656e-06, + "loss": 0.87989664, + "num_input_tokens_seen": 38118460, + "router_z_loss_clip": 3.82421875, + "router_z_loss_mlp": 3.47460938, + "step": 1790, + "time_per_iteration": 3.0255496501922607 + }, + { + "auxiliary_loss_clip": 0.01561765, + "auxiliary_loss_mlp": 0.01432457, + "balance_loss_clip": 1.1791153, + "balance_loss_mlp": 1.07940674, + "epoch": 0.21535501713461191, + "flos": 18918148075200.0, + "grad_norm": 2.8076173681537693, + "language_loss": 0.81468445, + "learning_rate": 3.65044798960787e-06, + "loss": 0.84462667, + "num_input_tokens_seen": 38136800, + "router_z_loss_clip": 3.82617188, + "router_z_loss_mlp": 3.53125, + "step": 1791, + "time_per_iteration": 3.0824573040008545 + }, + { + "auxiliary_loss_clip": 0.01561126, + "auxiliary_loss_mlp": 0.01448364, + "balance_loss_clip": 1.17926049, + "balance_loss_mlp": 1.10294271, + "epoch": 0.215475260025251, + "flos": 17897073878880.0, + "grad_norm": 1.9474045416727672, + "language_loss": 0.78716934, + "learning_rate": 3.650007896889627e-06, + "loss": 0.8172642, + "num_input_tokens_seen": 38155380, + "router_z_loss_clip": 3.81835938, + "router_z_loss_mlp": 3.45898438, + "step": 1792, + "time_per_iteration": 3.2005603313446045 + }, + { + "auxiliary_loss_clip": 0.01558921, + "auxiliary_loss_mlp": 0.01456388, + "balance_loss_clip": 1.17608762, + "balance_loss_mlp": 1.10886884, + "epoch": 0.2155955029158901, + "flos": 16656165881280.0, + "grad_norm": 2.828311703268814, + "language_loss": 0.80839849, + "learning_rate": 3.6495675538664974e-06, + "loss": 0.83855158, + "num_input_tokens_seen": 38174395, + "router_z_loss_clip": 3.83203125, + "router_z_loss_mlp": 3.4765625, + "step": 1793, + "time_per_iteration": 2.9839982986450195 + }, + { + "auxiliary_loss_clip": 0.01554492, + "auxiliary_loss_mlp": 0.01438479, + "balance_loss_clip": 1.17056441, + "balance_loss_mlp": 1.09038782, + "epoch": 0.2157157458065292, + "flos": 23623372961280.0, + "grad_norm": 1.8549405158754717, + "language_loss": 0.82925403, + "learning_rate": 3.649126960605282e-06, + "loss": 0.85918379, + "num_input_tokens_seen": 38195380, + "router_z_loss_clip": 3.84179688, + "router_z_loss_mlp": 3.48242188, + "step": 1794, + "time_per_iteration": 2.9953181743621826 + }, + { + "auxiliary_loss_clip": 0.01556694, + "auxiliary_loss_mlp": 0.01421098, + "balance_loss_clip": 1.17472982, + "balance_loss_mlp": 1.06347013, + "epoch": 0.21583598869716827, + "flos": 22129254017280.0, + "grad_norm": 2.447669363056806, + "language_loss": 0.83924359, + "learning_rate": 3.6486861171728174e-06, + "loss": 0.86902148, + "num_input_tokens_seen": 38213775, + "router_z_loss_clip": 3.81835938, + "router_z_loss_mlp": 3.578125, + "step": 1795, + "time_per_iteration": 3.0762712955474854 + }, + { + "auxiliary_loss_clip": 0.01556319, + "auxiliary_loss_mlp": 0.0142905, + "balance_loss_clip": 1.17368364, + "balance_loss_mlp": 1.07981491, + "epoch": 0.21595623158780738, + "flos": 23443515804960.0, + "grad_norm": 1.6360848582897902, + "language_loss": 0.78719473, + "learning_rate": 3.6482450236359803e-06, + "loss": 0.81704849, + "num_input_tokens_seen": 38235630, + "router_z_loss_clip": 3.828125, + "router_z_loss_mlp": 3.49609375, + "step": 1796, + "time_per_iteration": 2.9876620769500732 + }, + { + "auxiliary_loss_clip": 0.01560128, + "auxiliary_loss_mlp": 0.01417289, + "balance_loss_clip": 1.17921686, + "balance_loss_mlp": 1.06404805, + "epoch": 0.21607647447844647, + "flos": 26908932610080.0, + "grad_norm": 2.8264195246917776, + "language_loss": 0.77968943, + "learning_rate": 3.647803680061683e-06, + "loss": 0.80946362, + "num_input_tokens_seen": 38256045, + "router_z_loss_clip": 3.8125, + "router_z_loss_mlp": 3.53320312, + "step": 1797, + "time_per_iteration": 3.002686023712158 + }, + { + "auxiliary_loss_clip": 0.0155783, + "auxiliary_loss_mlp": 0.01404132, + "balance_loss_clip": 1.17574227, + "balance_loss_mlp": 1.05546832, + "epoch": 0.21619671736908555, + "flos": 14496780453120.0, + "grad_norm": 2.7257843784174987, + "language_loss": 0.75327671, + "learning_rate": 3.6473620865168776e-06, + "loss": 0.78289628, + "num_input_tokens_seen": 38272915, + "router_z_loss_clip": 3.82421875, + "router_z_loss_mlp": 3.48828125, + "step": 1798, + "time_per_iteration": 2.9842917919158936 + }, + { + "auxiliary_loss_clip": 0.01561093, + "auxiliary_loss_mlp": 0.01416969, + "balance_loss_clip": 1.17883611, + "balance_loss_mlp": 1.06182098, + "epoch": 0.21631696025972463, + "flos": 17933030138880.0, + "grad_norm": 1.8853329106713579, + "language_loss": 0.81845427, + "learning_rate": 3.646920243068554e-06, + "loss": 0.84823495, + "num_input_tokens_seen": 38290810, + "router_z_loss_clip": 3.82617188, + "router_z_loss_mlp": 3.5546875, + "step": 1799, + "time_per_iteration": 3.0754497051239014 + }, + { + "auxiliary_loss_clip": 0.01558514, + "auxiliary_loss_mlp": 0.01419703, + "balance_loss_clip": 1.17675126, + "balance_loss_mlp": 1.06283844, + "epoch": 0.21643720315036374, + "flos": 24464703785760.0, + "grad_norm": 2.435167734754955, + "language_loss": 0.74852794, + "learning_rate": 3.6464781497837384e-06, + "loss": 0.77831018, + "num_input_tokens_seen": 38312785, + "router_z_loss_clip": 3.82421875, + "router_z_loss_mlp": 3.5703125, + "step": 1800, + "time_per_iteration": 3.0597002506256104 + }, + { + "auxiliary_loss_clip": 0.01552422, + "auxiliary_loss_mlp": 0.01420028, + "balance_loss_clip": 1.17133737, + "balance_loss_mlp": 1.05572462, + "epoch": 0.21655744604100283, + "flos": 28476898410240.0, + "grad_norm": 1.6877241934548988, + "language_loss": 0.72317469, + "learning_rate": 3.6460358067294965e-06, + "loss": 0.75289929, + "num_input_tokens_seen": 38334015, + "router_z_loss_clip": 3.81640625, + "router_z_loss_mlp": 3.640625, + "step": 1801, + "time_per_iteration": 3.0723330974578857 + }, + { + "auxiliary_loss_clip": 0.01553129, + "auxiliary_loss_mlp": 0.01417497, + "balance_loss_clip": 1.17213225, + "balance_loss_mlp": 1.06292081, + "epoch": 0.2166776889316419, + "flos": 20154808118880.0, + "grad_norm": 2.8635201103023356, + "language_loss": 0.7792263, + "learning_rate": 3.645593213972932e-06, + "loss": 0.80893254, + "num_input_tokens_seen": 38352920, + "router_z_loss_clip": 3.81445312, + "router_z_loss_mlp": 3.54882812, + "step": 1802, + "time_per_iteration": 3.9703431129455566 + }, + { + "auxiliary_loss_clip": 0.01559273, + "auxiliary_loss_mlp": 0.01425356, + "balance_loss_clip": 1.1782726, + "balance_loss_mlp": 1.06067085, + "epoch": 0.21679793182228102, + "flos": 15195196513440.0, + "grad_norm": 2.1075399544484923, + "language_loss": 0.80048418, + "learning_rate": 3.6451503715811852e-06, + "loss": 0.83033049, + "num_input_tokens_seen": 38371230, + "router_z_loss_clip": 3.80859375, + "router_z_loss_mlp": 3.64648438, + "step": 1803, + "time_per_iteration": 3.116654396057129 + }, + { + "auxiliary_loss_clip": 0.01556042, + "auxiliary_loss_mlp": 0.01416963, + "balance_loss_clip": 1.17412424, + "balance_loss_mlp": 1.05265963, + "epoch": 0.2169181747129201, + "flos": 17386404032160.0, + "grad_norm": 2.2799152670562757, + "language_loss": 0.80417275, + "learning_rate": 3.6447072796214345e-06, + "loss": 0.83390284, + "num_input_tokens_seen": 38389795, + "router_z_loss_clip": 3.82421875, + "router_z_loss_mlp": 3.640625, + "step": 1804, + "time_per_iteration": 3.926414966583252 + }, + { + "auxiliary_loss_clip": 0.01637598, + "auxiliary_loss_mlp": 0.01291931, + "balance_loss_clip": 1.25472248, + "balance_loss_mlp": 1.02337646, + "epoch": 0.21703841760355919, + "flos": 58767349642560.0, + "grad_norm": 0.9535205360306862, + "language_loss": 0.63247252, + "learning_rate": 3.644263938160898e-06, + "loss": 0.66176784, + "num_input_tokens_seen": 38445760, + "router_z_loss_clip": 3.8359375, + "router_z_loss_mlp": 2.6875, + "step": 1805, + "time_per_iteration": 3.3833210468292236 + }, + { + "auxiliary_loss_clip": 0.01559224, + "auxiliary_loss_mlp": 0.01418986, + "balance_loss_clip": 1.17778611, + "balance_loss_mlp": 1.06135786, + "epoch": 0.21715866049419827, + "flos": 22420621056960.0, + "grad_norm": 2.0567687603070666, + "language_loss": 0.71982551, + "learning_rate": 3.6438203472668293e-06, + "loss": 0.74960768, + "num_input_tokens_seen": 38465405, + "router_z_loss_clip": 3.81835938, + "router_z_loss_mlp": 3.57421875, + "step": 1806, + "time_per_iteration": 3.206968069076538 + }, + { + "auxiliary_loss_clip": 0.01558178, + "auxiliary_loss_mlp": 0.01410517, + "balance_loss_clip": 1.17690957, + "balance_loss_mlp": 1.0507915, + "epoch": 0.21727890338483738, + "flos": 17239620595680.0, + "grad_norm": 1.9017177454485688, + "language_loss": 0.82011217, + "learning_rate": 3.6433765070065206e-06, + "loss": 0.84979916, + "num_input_tokens_seen": 38483195, + "router_z_loss_clip": 3.81054688, + "router_z_loss_mlp": 3.59375, + "step": 1807, + "time_per_iteration": 3.9642717838287354 + }, + { + "auxiliary_loss_clip": 0.01555097, + "auxiliary_loss_mlp": 0.01403472, + "balance_loss_clip": 1.17269897, + "balance_loss_mlp": 1.04145694, + "epoch": 0.21739914627547646, + "flos": 13435995108960.0, + "grad_norm": 2.8954586999074956, + "language_loss": 0.87643862, + "learning_rate": 3.6429324174473025e-06, + "loss": 0.90602434, + "num_input_tokens_seen": 38496735, + "router_z_loss_clip": 3.82617188, + "router_z_loss_mlp": 3.62109375, + "step": 1808, + "time_per_iteration": 2.9148197174072266 + }, + { + "auxiliary_loss_clip": 0.01560225, + "auxiliary_loss_mlp": 0.01411761, + "balance_loss_clip": 1.17946565, + "balance_loss_mlp": 1.04822028, + "epoch": 0.21751938916611555, + "flos": 20961623953440.0, + "grad_norm": 2.815274699550432, + "language_loss": 0.85582006, + "learning_rate": 3.6424880786565425e-06, + "loss": 0.88553995, + "num_input_tokens_seen": 38512880, + "router_z_loss_clip": 3.80859375, + "router_z_loss_mlp": 3.6328125, + "step": 1809, + "time_per_iteration": 3.0016326904296875 + }, + { + "auxiliary_loss_clip": 0.0156091, + "auxiliary_loss_mlp": 0.01414474, + "balance_loss_clip": 1.17977953, + "balance_loss_mlp": 1.05913544, + "epoch": 0.21763963205675466, + "flos": 27602418009600.0, + "grad_norm": 2.0939200838161516, + "language_loss": 0.80367947, + "learning_rate": 3.6420434907016482e-06, + "loss": 0.83343327, + "num_input_tokens_seen": 38532570, + "router_z_loss_clip": 3.81054688, + "router_z_loss_mlp": 3.55664062, + "step": 1810, + "time_per_iteration": 3.0019001960754395 + }, + { + "auxiliary_loss_clip": 0.01566788, + "auxiliary_loss_mlp": 0.01408775, + "balance_loss_clip": 1.18542767, + "balance_loss_mlp": 1.04656947, + "epoch": 0.21775987494739374, + "flos": 21432355083360.0, + "grad_norm": 2.0430744277212645, + "language_loss": 0.81169927, + "learning_rate": 3.6415986536500606e-06, + "loss": 0.84145486, + "num_input_tokens_seen": 38550900, + "router_z_loss_clip": 3.8125, + "router_z_loss_mlp": 3.62109375, + "step": 1811, + "time_per_iteration": 3.8625872135162354 + }, + { + "auxiliary_loss_clip": 0.01572721, + "auxiliary_loss_mlp": 0.01416202, + "balance_loss_clip": 1.19358981, + "balance_loss_mlp": 1.05247092, + "epoch": 0.21788011783803282, + "flos": 18334958857920.0, + "grad_norm": 1.668674074981156, + "language_loss": 0.80905509, + "learning_rate": 3.641153567569263e-06, + "loss": 0.83894432, + "num_input_tokens_seen": 38569215, + "router_z_loss_clip": 3.79101562, + "router_z_loss_mlp": 3.63671875, + "step": 1812, + "time_per_iteration": 3.0249147415161133 + }, + { + "auxiliary_loss_clip": 0.01565528, + "auxiliary_loss_mlp": 0.01434967, + "balance_loss_clip": 1.18543553, + "balance_loss_mlp": 1.08153498, + "epoch": 0.2180003607286719, + "flos": 30265342790400.0, + "grad_norm": 2.155082235816307, + "language_loss": 0.96398425, + "learning_rate": 3.640708232526774e-06, + "loss": 0.99398923, + "num_input_tokens_seen": 38587870, + "router_z_loss_clip": 3.80664062, + "router_z_loss_mlp": 3.53320312, + "step": 1813, + "time_per_iteration": 3.0089385509490967 + }, + { + "auxiliary_loss_clip": 0.01571514, + "auxiliary_loss_mlp": 0.01433382, + "balance_loss_clip": 1.19284916, + "balance_loss_mlp": 1.08586264, + "epoch": 0.21812060361931102, + "flos": 25482440304000.0, + "grad_norm": 1.8820519178013182, + "language_loss": 0.78790474, + "learning_rate": 3.6402626485901504e-06, + "loss": 0.81795371, + "num_input_tokens_seen": 38606965, + "router_z_loss_clip": 3.78710938, + "router_z_loss_mlp": 3.4765625, + "step": 1814, + "time_per_iteration": 3.0351717472076416 + }, + { + "auxiliary_loss_clip": 0.01572664, + "auxiliary_loss_mlp": 0.01421241, + "balance_loss_clip": 1.19313836, + "balance_loss_mlp": 1.07849073, + "epoch": 0.2182408465099501, + "flos": 21910330491840.0, + "grad_norm": 3.109205170421495, + "language_loss": 0.7874282, + "learning_rate": 3.639816815826988e-06, + "loss": 0.81736726, + "num_input_tokens_seen": 38626290, + "router_z_loss_clip": 3.796875, + "router_z_loss_mlp": 3.43164062, + "step": 1815, + "time_per_iteration": 2.97609543800354 + }, + { + "auxiliary_loss_clip": 0.01567603, + "auxiliary_loss_mlp": 0.01427151, + "balance_loss_clip": 1.18817282, + "balance_loss_mlp": 1.07944107, + "epoch": 0.21836108940058918, + "flos": 23659480933920.0, + "grad_norm": 2.3137612854445737, + "language_loss": 0.778234, + "learning_rate": 3.6393707343049176e-06, + "loss": 0.80818158, + "num_input_tokens_seen": 38646620, + "router_z_loss_clip": 3.79492188, + "router_z_loss_mlp": 3.48046875, + "step": 1816, + "time_per_iteration": 3.015979290008545 + }, + { + "auxiliary_loss_clip": 0.01559908, + "auxiliary_loss_mlp": 0.01439049, + "balance_loss_clip": 1.17889774, + "balance_loss_mlp": 1.0962981, + "epoch": 0.2184813322912283, + "flos": 24683323885920.0, + "grad_norm": 2.2271151396008553, + "language_loss": 0.73131597, + "learning_rate": 3.6389244040916104e-06, + "loss": 0.76130557, + "num_input_tokens_seen": 38665695, + "router_z_loss_clip": 3.80859375, + "router_z_loss_mlp": 3.4296875, + "step": 1817, + "time_per_iteration": 3.131986618041992 + }, + { + "auxiliary_loss_clip": 0.01565344, + "auxiliary_loss_mlp": 0.01455607, + "balance_loss_clip": 1.18460202, + "balance_loss_mlp": 1.103701, + "epoch": 0.21860157518186737, + "flos": 26576868290400.0, + "grad_norm": 2.223523089757296, + "language_loss": 0.79483962, + "learning_rate": 3.6384778252547747e-06, + "loss": 0.82504916, + "num_input_tokens_seen": 38681575, + "router_z_loss_clip": 3.81054688, + "router_z_loss_mlp": 3.51953125, + "step": 1818, + "time_per_iteration": 3.0707294940948486 + }, + { + "auxiliary_loss_clip": 0.01575972, + "auxiliary_loss_mlp": 0.01530487, + "balance_loss_clip": 1.19697809, + "balance_loss_mlp": 1.21882665, + "epoch": 0.21872181807250646, + "flos": 20888383947840.0, + "grad_norm": 2.1791035779772567, + "language_loss": 0.77740353, + "learning_rate": 3.638030997862155e-06, + "loss": 0.80846816, + "num_input_tokens_seen": 38700510, + "router_z_loss_clip": 3.79492188, + "router_z_loss_mlp": 3.11328125, + "step": 1819, + "time_per_iteration": 3.026160478591919 + }, + { + "auxiliary_loss_clip": 0.01675732, + "auxiliary_loss_mlp": 0.01460625, + "balance_loss_clip": 1.29465055, + "balance_loss_mlp": 1.20275116, + "epoch": 0.21884206096314554, + "flos": 61216205702400.0, + "grad_norm": 0.8026683075006215, + "language_loss": 0.59362751, + "learning_rate": 3.6375839219815356e-06, + "loss": 0.62499106, + "num_input_tokens_seen": 38758310, + "router_z_loss_clip": 3.8125, + "router_z_loss_mlp": 2.578125, + "step": 1820, + "time_per_iteration": 3.471573829650879 + }, + { + "auxiliary_loss_clip": 0.01568164, + "auxiliary_loss_mlp": 0.01493157, + "balance_loss_clip": 1.1876061, + "balance_loss_mlp": 1.18130517, + "epoch": 0.21896230385378465, + "flos": 23475906817920.0, + "grad_norm": 2.4636764845047194, + "language_loss": 0.82584178, + "learning_rate": 3.6371365976807375e-06, + "loss": 0.85645497, + "num_input_tokens_seen": 38778705, + "router_z_loss_clip": 3.80664062, + "router_z_loss_mlp": 3.11523438, + "step": 1821, + "time_per_iteration": 3.0160248279571533 + }, + { + "auxiliary_loss_clip": 0.01571002, + "auxiliary_loss_mlp": 0.01488795, + "balance_loss_clip": 1.18899906, + "balance_loss_mlp": 1.17637157, + "epoch": 0.21908254674442373, + "flos": 25084228544640.0, + "grad_norm": 2.0382890993438285, + "language_loss": 0.83870101, + "learning_rate": 3.6366890250276185e-06, + "loss": 0.86929899, + "num_input_tokens_seen": 38799660, + "router_z_loss_clip": 3.82226562, + "router_z_loss_mlp": 3.12109375, + "step": 1822, + "time_per_iteration": 3.0829403400421143 + }, + { + "auxiliary_loss_clip": 0.01561422, + "auxiliary_loss_mlp": 0.01453659, + "balance_loss_clip": 1.17881894, + "balance_loss_mlp": 1.13780189, + "epoch": 0.21920278963506282, + "flos": 23516035175520.0, + "grad_norm": 2.7399968757337994, + "language_loss": 0.90028584, + "learning_rate": 3.6362412040900764e-06, + "loss": 0.93043661, + "num_input_tokens_seen": 38819450, + "router_z_loss_clip": 3.828125, + "router_z_loss_mlp": 3.15625, + "step": 1823, + "time_per_iteration": 3.051949977874756 + }, + { + "auxiliary_loss_clip": 0.01561171, + "auxiliary_loss_mlp": 0.01439353, + "balance_loss_clip": 1.18015373, + "balance_loss_mlp": 1.11968136, + "epoch": 0.21932303252570193, + "flos": 29244837516480.0, + "grad_norm": 1.9541175192635571, + "language_loss": 0.80703247, + "learning_rate": 3.635793134936044e-06, + "loss": 0.83703774, + "num_input_tokens_seen": 38840460, + "router_z_loss_clip": 3.81640625, + "router_z_loss_mlp": 3.1953125, + "step": 1824, + "time_per_iteration": 3.0459144115448 + }, + { + "auxiliary_loss_clip": 0.01566052, + "auxiliary_loss_mlp": 0.01435895, + "balance_loss_clip": 1.18447828, + "balance_loss_mlp": 1.12137306, + "epoch": 0.219443275416341, + "flos": 20808582370560.0, + "grad_norm": 1.7069219550957726, + "language_loss": 0.73362988, + "learning_rate": 3.635344817633494e-06, + "loss": 0.76364934, + "num_input_tokens_seen": 38859775, + "router_z_loss_clip": 3.81445312, + "router_z_loss_mlp": 3.14257812, + "step": 1825, + "time_per_iteration": 3.0650527477264404 + }, + { + "auxiliary_loss_clip": 0.01564241, + "auxiliary_loss_mlp": 0.01417221, + "balance_loss_clip": 1.18197465, + "balance_loss_mlp": 1.09907532, + "epoch": 0.2195635183069801, + "flos": 14503304096640.0, + "grad_norm": 1.9112897169268879, + "language_loss": 0.7529152, + "learning_rate": 3.634896252250436e-06, + "loss": 0.7827298, + "num_input_tokens_seen": 38876540, + "router_z_loss_clip": 3.8203125, + "router_z_loss_mlp": 3.1796875, + "step": 1826, + "time_per_iteration": 2.9733049869537354 + }, + { + "auxiliary_loss_clip": 0.01561783, + "auxiliary_loss_mlp": 0.01391103, + "balance_loss_clip": 1.18228972, + "balance_loss_mlp": 1.0685699, + "epoch": 0.2196837611976192, + "flos": 24245780260320.0, + "grad_norm": 1.8755993312106796, + "language_loss": 0.82527626, + "learning_rate": 3.6344474388549157e-06, + "loss": 0.85480511, + "num_input_tokens_seen": 38896195, + "router_z_loss_clip": 3.79296875, + "router_z_loss_mlp": 3.22460938, + "step": 1827, + "time_per_iteration": 3.0547420978546143 + }, + { + "auxiliary_loss_clip": 0.01561472, + "auxiliary_loss_mlp": 0.01408709, + "balance_loss_clip": 1.18267953, + "balance_loss_mlp": 1.08674884, + "epoch": 0.2198040040882583, + "flos": 18076324184640.0, + "grad_norm": 2.350099473297804, + "language_loss": 0.79891443, + "learning_rate": 3.6339983775150183e-06, + "loss": 0.82861614, + "num_input_tokens_seen": 38912755, + "router_z_loss_clip": 3.7890625, + "router_z_loss_mlp": 3.21875, + "step": 1828, + "time_per_iteration": 3.0208141803741455 + }, + { + "auxiliary_loss_clip": 0.01561637, + "auxiliary_loss_mlp": 0.01380153, + "balance_loss_clip": 1.18159389, + "balance_loss_mlp": 1.05762029, + "epoch": 0.21992424697889737, + "flos": 17786057061600.0, + "grad_norm": 2.6049528076574617, + "language_loss": 0.84086967, + "learning_rate": 3.6335490682988664e-06, + "loss": 0.87028754, + "num_input_tokens_seen": 38928365, + "router_z_loss_clip": 3.8046875, + "router_z_loss_mlp": 3.22460938, + "step": 1829, + "time_per_iteration": 3.914886951446533 + }, + { + "auxiliary_loss_clip": 0.01567146, + "auxiliary_loss_mlp": 0.01397283, + "balance_loss_clip": 1.18626666, + "balance_loss_mlp": 1.07112622, + "epoch": 0.22004448986953645, + "flos": 17640373541760.0, + "grad_norm": 2.911412441333116, + "language_loss": 0.82733643, + "learning_rate": 3.63309951127462e-06, + "loss": 0.85698074, + "num_input_tokens_seen": 38945275, + "router_z_loss_clip": 3.81054688, + "router_z_loss_mlp": 3.26171875, + "step": 1830, + "time_per_iteration": 3.1360607147216797 + }, + { + "auxiliary_loss_clip": 0.01568224, + "auxiliary_loss_mlp": 0.01386082, + "balance_loss_clip": 1.18641162, + "balance_loss_mlp": 1.05935287, + "epoch": 0.22016473276017556, + "flos": 22277858005440.0, + "grad_norm": 2.4837390852128305, + "language_loss": 0.7532987, + "learning_rate": 3.6326497065104757e-06, + "loss": 0.7828418, + "num_input_tokens_seen": 38965740, + "router_z_loss_clip": 3.82226562, + "router_z_loss_mlp": 3.26757812, + "step": 1831, + "time_per_iteration": 3.8322360515594482 + }, + { + "auxiliary_loss_clip": 0.01565162, + "auxiliary_loss_mlp": 0.01383522, + "balance_loss_clip": 1.18345344, + "balance_loss_mlp": 1.05374193, + "epoch": 0.22028497565081465, + "flos": 25558600777920.0, + "grad_norm": 3.4005198620296353, + "language_loss": 0.78315514, + "learning_rate": 3.6321996540746697e-06, + "loss": 0.81264198, + "num_input_tokens_seen": 38984815, + "router_z_loss_clip": 3.81445312, + "router_z_loss_mlp": 3.29882812, + "step": 1832, + "time_per_iteration": 2.9307687282562256 + }, + { + "auxiliary_loss_clip": 0.01561511, + "auxiliary_loss_mlp": 0.01395044, + "balance_loss_clip": 1.18029547, + "balance_loss_mlp": 1.06507289, + "epoch": 0.22040521854145373, + "flos": 36250314474240.0, + "grad_norm": 2.4241963205861796, + "language_loss": 0.80561352, + "learning_rate": 3.6317493540354733e-06, + "loss": 0.83517909, + "num_input_tokens_seen": 39008230, + "router_z_loss_clip": 3.81445312, + "router_z_loss_mlp": 3.30078125, + "step": 1833, + "time_per_iteration": 3.0752675533294678 + }, + { + "auxiliary_loss_clip": 0.01565405, + "auxiliary_loss_mlp": 0.01387484, + "balance_loss_clip": 1.1844368, + "balance_loss_mlp": 1.05694079, + "epoch": 0.22052546143209284, + "flos": 11840037962400.0, + "grad_norm": 1.9238391397334753, + "language_loss": 0.76756811, + "learning_rate": 3.6312988064611976e-06, + "loss": 0.79709703, + "num_input_tokens_seen": 39026540, + "router_z_loss_clip": 3.81054688, + "router_z_loss_mlp": 3.30664062, + "step": 1834, + "time_per_iteration": 2.985495090484619 + }, + { + "auxiliary_loss_clip": 0.01563387, + "auxiliary_loss_mlp": 0.0138357, + "balance_loss_clip": 1.18391514, + "balance_loss_mlp": 1.05111921, + "epoch": 0.22064570432273192, + "flos": 24211682480160.0, + "grad_norm": 1.7952701642305589, + "language_loss": 0.81508958, + "learning_rate": 3.6308480114201896e-06, + "loss": 0.84455919, + "num_input_tokens_seen": 39048460, + "router_z_loss_clip": 3.796875, + "router_z_loss_mlp": 3.32617188, + "step": 1835, + "time_per_iteration": 3.8056581020355225 + }, + { + "auxiliary_loss_clip": 0.01558827, + "auxiliary_loss_mlp": 0.01376284, + "balance_loss_clip": 1.17833948, + "balance_loss_mlp": 1.04688537, + "epoch": 0.220765947213371, + "flos": 17933295636000.0, + "grad_norm": 1.7985267751787928, + "language_loss": 0.76965749, + "learning_rate": 3.630396968980835e-06, + "loss": 0.79900861, + "num_input_tokens_seen": 39066335, + "router_z_loss_clip": 3.80664062, + "router_z_loss_mlp": 3.29492188, + "step": 1836, + "time_per_iteration": 2.9761669635772705 + }, + { + "auxiliary_loss_clip": 0.01563475, + "auxiliary_loss_mlp": 0.01380988, + "balance_loss_clip": 1.18357337, + "balance_loss_mlp": 1.04834616, + "epoch": 0.2208861901040101, + "flos": 26759266633440.0, + "grad_norm": 7.925427960795263, + "language_loss": 0.83970588, + "learning_rate": 3.6299456792115575e-06, + "loss": 0.86915046, + "num_input_tokens_seen": 39087590, + "router_z_loss_clip": 3.80078125, + "router_z_loss_mlp": 3.328125, + "step": 1837, + "time_per_iteration": 3.0122628211975098 + }, + { + "auxiliary_loss_clip": 0.0156297, + "auxiliary_loss_mlp": 0.01369867, + "balance_loss_clip": 1.18121767, + "balance_loss_mlp": 1.04065895, + "epoch": 0.2210064329946492, + "flos": 17819699703840.0, + "grad_norm": 1.9869443156362334, + "language_loss": 0.81305248, + "learning_rate": 3.629494142180815e-06, + "loss": 0.84238082, + "num_input_tokens_seen": 39106335, + "router_z_loss_clip": 3.81640625, + "router_z_loss_mlp": 3.29296875, + "step": 1838, + "time_per_iteration": 3.7301950454711914 + }, + { + "auxiliary_loss_clip": 0.01559793, + "auxiliary_loss_mlp": 0.01360414, + "balance_loss_clip": 1.17908311, + "balance_loss_mlp": 1.03406692, + "epoch": 0.22112667588528828, + "flos": 17969327752320.0, + "grad_norm": 2.043354704665388, + "language_loss": 0.85208094, + "learning_rate": 3.6290423579571075e-06, + "loss": 0.88128304, + "num_input_tokens_seen": 39122875, + "router_z_loss_clip": 3.80859375, + "router_z_loss_mlp": 3.26367188, + "step": 1839, + "time_per_iteration": 3.0636773109436035 + }, + { + "auxiliary_loss_clip": 0.01565513, + "auxiliary_loss_mlp": 0.01363, + "balance_loss_clip": 1.18627048, + "balance_loss_mlp": 1.03512657, + "epoch": 0.22124691877592736, + "flos": 18371180615040.0, + "grad_norm": 1.7733926972984408, + "language_loss": 0.81026047, + "learning_rate": 3.6285903266089694e-06, + "loss": 0.83954561, + "num_input_tokens_seen": 39142150, + "router_z_loss_clip": 3.796875, + "router_z_loss_mlp": 3.27929688, + "step": 1840, + "time_per_iteration": 3.11215877532959 + }, + { + "auxiliary_loss_clip": 0.01557446, + "auxiliary_loss_mlp": 0.01379255, + "balance_loss_clip": 1.17791593, + "balance_loss_mlp": 1.05023694, + "epoch": 0.22136716166656648, + "flos": 20815371511200.0, + "grad_norm": 1.9085140937728384, + "language_loss": 0.77573991, + "learning_rate": 3.628138048204974e-06, + "loss": 0.80510688, + "num_input_tokens_seen": 39162835, + "router_z_loss_clip": 3.79882812, + "router_z_loss_mlp": 3.29101562, + "step": 1841, + "time_per_iteration": 3.015996217727661 + }, + { + "auxiliary_loss_clip": 0.01561177, + "auxiliary_loss_mlp": 0.01372829, + "balance_loss_clip": 1.18008149, + "balance_loss_mlp": 1.04266667, + "epoch": 0.22148740455720556, + "flos": 17677619359200.0, + "grad_norm": 1.9494264022711736, + "language_loss": 0.76280373, + "learning_rate": 3.6276855228137304e-06, + "loss": 0.79214382, + "num_input_tokens_seen": 39181040, + "router_z_loss_clip": 3.81445312, + "router_z_loss_mlp": 3.30273438, + "step": 1842, + "time_per_iteration": 3.122015953063965 + }, + { + "auxiliary_loss_clip": 0.01560589, + "auxiliary_loss_mlp": 0.01376889, + "balance_loss_clip": 1.18100178, + "balance_loss_mlp": 1.04729891, + "epoch": 0.22160764744784464, + "flos": 21728918280960.0, + "grad_norm": 4.702780711864801, + "language_loss": 0.82320863, + "learning_rate": 3.6272327505038874e-06, + "loss": 0.85258341, + "num_input_tokens_seen": 39197505, + "router_z_loss_clip": 3.796875, + "router_z_loss_mlp": 3.296875, + "step": 1843, + "time_per_iteration": 2.9446959495544434 + }, + { + "auxiliary_loss_clip": 0.01553284, + "auxiliary_loss_mlp": 0.01370538, + "balance_loss_clip": 1.17334747, + "balance_loss_mlp": 1.04209292, + "epoch": 0.22172789033848372, + "flos": 23766742863360.0, + "grad_norm": 2.0850420943023074, + "language_loss": 0.78570855, + "learning_rate": 3.626779731344131e-06, + "loss": 0.81494683, + "num_input_tokens_seen": 39217295, + "router_z_loss_clip": 3.80078125, + "router_z_loss_mlp": 3.28515625, + "step": 1844, + "time_per_iteration": 3.104642629623413 + }, + { + "auxiliary_loss_clip": 0.01558912, + "auxiliary_loss_mlp": 0.01369181, + "balance_loss_clip": 1.17772818, + "balance_loss_mlp": 1.04111755, + "epoch": 0.22184813322912283, + "flos": 16984171887840.0, + "grad_norm": 3.009569791237984, + "language_loss": 0.85192049, + "learning_rate": 3.6263264654031814e-06, + "loss": 0.88120139, + "num_input_tokens_seen": 39234195, + "router_z_loss_clip": 3.81054688, + "router_z_loss_mlp": 3.28125, + "step": 1845, + "time_per_iteration": 2.967120409011841 + }, + { + "auxiliary_loss_clip": 0.01684286, + "auxiliary_loss_mlp": 0.01396164, + "balance_loss_clip": 1.30283773, + "balance_loss_mlp": 1.11463928, + "epoch": 0.22196837611976192, + "flos": 61830192949920.0, + "grad_norm": 0.7497395412684333, + "language_loss": 0.59045804, + "learning_rate": 3.6258729527498008e-06, + "loss": 0.62126255, + "num_input_tokens_seen": 39295040, + "router_z_loss_clip": 3.8125, + "router_z_loss_mlp": 2.8203125, + "step": 1846, + "time_per_iteration": 3.4017488956451416 + }, + { + "auxiliary_loss_clip": 0.01563047, + "auxiliary_loss_mlp": 0.01371308, + "balance_loss_clip": 1.18252397, + "balance_loss_mlp": 1.04381645, + "epoch": 0.222088619010401, + "flos": 25560459257760.0, + "grad_norm": 2.348662120707422, + "language_loss": 0.64685172, + "learning_rate": 3.6254191934527854e-06, + "loss": 0.67619526, + "num_input_tokens_seen": 39314395, + "router_z_loss_clip": 3.8046875, + "router_z_loss_mlp": 3.27539062, + "step": 1847, + "time_per_iteration": 3.05020809173584 + }, + { + "auxiliary_loss_clip": 0.01567851, + "auxiliary_loss_mlp": 0.01380134, + "balance_loss_clip": 1.18758535, + "balance_loss_mlp": 1.0520699, + "epoch": 0.2222088619010401, + "flos": 19320683644800.0, + "grad_norm": 2.0112200770680118, + "language_loss": 0.64919162, + "learning_rate": 3.6249651875809715e-06, + "loss": 0.67867142, + "num_input_tokens_seen": 39334275, + "router_z_loss_clip": 3.80273438, + "router_z_loss_mlp": 3.28125, + "step": 1848, + "time_per_iteration": 3.052779197692871 + }, + { + "auxiliary_loss_clip": 0.01565323, + "auxiliary_loss_mlp": 0.01372787, + "balance_loss_clip": 1.18474197, + "balance_loss_mlp": 1.04815674, + "epoch": 0.2223291047916792, + "flos": 19101380837760.0, + "grad_norm": 2.0264945912628796, + "language_loss": 0.89371443, + "learning_rate": 3.62451093520323e-06, + "loss": 0.92309558, + "num_input_tokens_seen": 39352180, + "router_z_loss_clip": 3.80664062, + "router_z_loss_mlp": 3.24609375, + "step": 1849, + "time_per_iteration": 3.0401172637939453 + }, + { + "auxiliary_loss_clip": 0.01565624, + "auxiliary_loss_mlp": 0.01372519, + "balance_loss_clip": 1.18342173, + "balance_loss_mlp": 1.04750633, + "epoch": 0.22244934768231828, + "flos": 20852541472320.0, + "grad_norm": 2.1277582749337007, + "language_loss": 0.90924883, + "learning_rate": 3.6240564363884714e-06, + "loss": 0.93863016, + "num_input_tokens_seen": 39372125, + "router_z_loss_clip": 3.82421875, + "router_z_loss_mlp": 3.25, + "step": 1850, + "time_per_iteration": 2.9833827018737793 + }, + { + "auxiliary_loss_clip": 0.01560382, + "auxiliary_loss_mlp": 0.01379415, + "balance_loss_clip": 1.17860687, + "balance_loss_mlp": 1.05230522, + "epoch": 0.2225695905729574, + "flos": 15634750331520.0, + "grad_norm": 1.7803274461109173, + "language_loss": 0.70613718, + "learning_rate": 3.623601691205643e-06, + "loss": 0.73553514, + "num_input_tokens_seen": 39391200, + "router_z_loss_clip": 3.81835938, + "router_z_loss_mlp": 3.27148438, + "step": 1851, + "time_per_iteration": 2.91556453704834 + }, + { + "auxiliary_loss_clip": 0.01562942, + "auxiliary_loss_mlp": 0.01389711, + "balance_loss_clip": 1.18154287, + "balance_loss_mlp": 1.0667969, + "epoch": 0.22268983346359647, + "flos": 25375899009600.0, + "grad_norm": 1.9917731868871558, + "language_loss": 0.81547379, + "learning_rate": 3.623146699723729e-06, + "loss": 0.84500033, + "num_input_tokens_seen": 39410660, + "router_z_loss_clip": 3.81835938, + "router_z_loss_mlp": 3.22851562, + "step": 1852, + "time_per_iteration": 3.0383665561676025 + }, + { + "auxiliary_loss_clip": 0.01569641, + "auxiliary_loss_mlp": 0.01381631, + "balance_loss_clip": 1.18744707, + "balance_loss_mlp": 1.05547452, + "epoch": 0.22281007635423555, + "flos": 13263154662240.0, + "grad_norm": 1.891147425957581, + "language_loss": 0.78025389, + "learning_rate": 3.6226914620117507e-06, + "loss": 0.80976659, + "num_input_tokens_seen": 39429280, + "router_z_loss_clip": 3.82421875, + "router_z_loss_mlp": 3.26171875, + "step": 1853, + "time_per_iteration": 3.1048219203948975 + }, + { + "auxiliary_loss_clip": 0.01554445, + "auxiliary_loss_mlp": 0.01366066, + "balance_loss_clip": 1.1742909, + "balance_loss_mlp": 1.0393368, + "epoch": 0.22293031924487464, + "flos": 15342321303360.0, + "grad_norm": 1.924001951601937, + "language_loss": 0.81064904, + "learning_rate": 3.622235978138768e-06, + "loss": 0.83985412, + "num_input_tokens_seen": 39446905, + "router_z_loss_clip": 3.8046875, + "router_z_loss_mlp": 3.26757812, + "step": 1854, + "time_per_iteration": 2.971285581588745 + }, + { + "auxiliary_loss_clip": 0.01560485, + "auxiliary_loss_mlp": 0.01369925, + "balance_loss_clip": 1.18029356, + "balance_loss_mlp": 1.04434013, + "epoch": 0.22305056213551375, + "flos": 22566835571040.0, + "grad_norm": 1.7720952149231546, + "language_loss": 0.81608903, + "learning_rate": 3.621780248173877e-06, + "loss": 0.84539318, + "num_input_tokens_seen": 39465105, + "router_z_loss_clip": 3.80664062, + "router_z_loss_mlp": 3.25585938, + "step": 1855, + "time_per_iteration": 3.0404348373413086 + }, + { + "auxiliary_loss_clip": 0.01679562, + "auxiliary_loss_mlp": 0.01282799, + "balance_loss_clip": 1.29858661, + "balance_loss_mlp": 1.01195526, + "epoch": 0.22317080502615283, + "flos": 64886588470080.0, + "grad_norm": 0.8617741420853395, + "language_loss": 0.61086959, + "learning_rate": 3.6213242721862125e-06, + "loss": 0.64049321, + "num_input_tokens_seen": 39523560, + "router_z_loss_clip": 3.8125, + "router_z_loss_mlp": 2.7109375, + "step": 1856, + "time_per_iteration": 4.360265016555786 + }, + { + "auxiliary_loss_clip": 0.01561191, + "auxiliary_loss_mlp": 0.01364548, + "balance_loss_clip": 1.18038046, + "balance_loss_mlp": 1.03972673, + "epoch": 0.2232910479167919, + "flos": 25778396651040.0, + "grad_norm": 1.5819419653575453, + "language_loss": 0.75421339, + "learning_rate": 3.620868050244945e-06, + "loss": 0.78347075, + "num_input_tokens_seen": 39544040, + "router_z_loss_clip": 3.80664062, + "router_z_loss_mlp": 3.24804688, + "step": 1857, + "time_per_iteration": 3.033059597015381 + }, + { + "auxiliary_loss_clip": 0.01557, + "auxiliary_loss_mlp": 0.01374747, + "balance_loss_clip": 1.17794633, + "balance_loss_mlp": 1.04363096, + "epoch": 0.22341129080743102, + "flos": 23253683542560.0, + "grad_norm": 2.015843823942842, + "language_loss": 0.77945721, + "learning_rate": 3.6204115824192817e-06, + "loss": 0.80877471, + "num_input_tokens_seen": 39561515, + "router_z_loss_clip": 3.7890625, + "router_z_loss_mlp": 3.3125, + "step": 1858, + "time_per_iteration": 3.9365651607513428 + }, + { + "auxiliary_loss_clip": 0.01558935, + "auxiliary_loss_mlp": 0.01360092, + "balance_loss_clip": 1.17817521, + "balance_loss_mlp": 1.03221893, + "epoch": 0.2235315336980701, + "flos": 21216655451520.0, + "grad_norm": 2.2419745112405627, + "language_loss": 0.76821977, + "learning_rate": 3.619954868778471e-06, + "loss": 0.79741001, + "num_input_tokens_seen": 39578210, + "router_z_loss_clip": 3.80664062, + "router_z_loss_mlp": 3.27929688, + "step": 1859, + "time_per_iteration": 3.122380018234253 + }, + { + "auxiliary_loss_clip": 0.01556594, + "auxiliary_loss_mlp": 0.01363946, + "balance_loss_clip": 1.17764854, + "balance_loss_mlp": 1.03836215, + "epoch": 0.2236517765887092, + "flos": 19904024574720.0, + "grad_norm": 2.419733290336824, + "language_loss": 0.83400464, + "learning_rate": 3.6194979093917944e-06, + "loss": 0.86321008, + "num_input_tokens_seen": 39597625, + "router_z_loss_clip": 3.79296875, + "router_z_loss_mlp": 3.25585938, + "step": 1860, + "time_per_iteration": 3.022608518600464 + }, + { + "auxiliary_loss_clip": 0.0155761, + "auxiliary_loss_mlp": 0.01369659, + "balance_loss_clip": 1.178828, + "balance_loss_mlp": 1.04121363, + "epoch": 0.22377201947934827, + "flos": 23216703222240.0, + "grad_norm": 2.1038556077220814, + "language_loss": 0.87369859, + "learning_rate": 3.6190407043285724e-06, + "loss": 0.90297139, + "num_input_tokens_seen": 39615360, + "router_z_loss_clip": 3.7890625, + "router_z_loss_mlp": 3.28515625, + "step": 1861, + "time_per_iteration": 3.0644495487213135 + }, + { + "auxiliary_loss_clip": 0.01552661, + "auxiliary_loss_mlp": 0.01367978, + "balance_loss_clip": 1.1719718, + "balance_loss_mlp": 1.04029584, + "epoch": 0.22389226236998738, + "flos": 26796512450880.0, + "grad_norm": 1.7996315575197523, + "language_loss": 0.7580862, + "learning_rate": 3.618583253658163e-06, + "loss": 0.7872926, + "num_input_tokens_seen": 39635460, + "router_z_loss_clip": 3.80859375, + "router_z_loss_mlp": 3.27734375, + "step": 1862, + "time_per_iteration": 3.9011590480804443 + }, + { + "auxiliary_loss_clip": 0.01560537, + "auxiliary_loss_mlp": 0.01364654, + "balance_loss_clip": 1.18030322, + "balance_loss_mlp": 1.03639948, + "epoch": 0.22401250526062647, + "flos": 24172654039200.0, + "grad_norm": 2.0585368634155263, + "language_loss": 0.86623204, + "learning_rate": 3.618125557449961e-06, + "loss": 0.89548391, + "num_input_tokens_seen": 39653515, + "router_z_loss_clip": 3.80273438, + "router_z_loss_mlp": 3.28320312, + "step": 1863, + "time_per_iteration": 3.02634334564209 + }, + { + "auxiliary_loss_clip": 0.01549305, + "auxiliary_loss_mlp": 0.01371911, + "balance_loss_clip": 1.17003798, + "balance_loss_mlp": 1.0419395, + "epoch": 0.22413274815126555, + "flos": 16761417618240.0, + "grad_norm": 2.0211852948445057, + "language_loss": 0.83176285, + "learning_rate": 3.6176676157733983e-06, + "loss": 0.86097503, + "num_input_tokens_seen": 39668525, + "router_z_loss_clip": 3.79101562, + "router_z_loss_mlp": 3.30078125, + "step": 1864, + "time_per_iteration": 3.2191548347473145 + }, + { + "auxiliary_loss_clip": 0.01553024, + "auxiliary_loss_mlp": 0.01366686, + "balance_loss_clip": 1.17462862, + "balance_loss_mlp": 1.03957558, + "epoch": 0.22425299104190466, + "flos": 21362301043200.0, + "grad_norm": 2.2917407828067815, + "language_loss": 0.76357639, + "learning_rate": 3.6172094286979443e-06, + "loss": 0.79277349, + "num_input_tokens_seen": 39685895, + "router_z_loss_clip": 3.78320312, + "router_z_loss_mlp": 3.27148438, + "step": 1865, + "time_per_iteration": 3.78869366645813 + }, + { + "auxiliary_loss_clip": 0.0154852, + "auxiliary_loss_mlp": 0.0136812, + "balance_loss_clip": 1.16906512, + "balance_loss_mlp": 1.04062843, + "epoch": 0.22437323393254374, + "flos": 32168558875680.0, + "grad_norm": 1.4174635138497096, + "language_loss": 0.81680846, + "learning_rate": 3.6167509962931064e-06, + "loss": 0.84597486, + "num_input_tokens_seen": 39711595, + "router_z_loss_clip": 3.79296875, + "router_z_loss_mlp": 3.27539062, + "step": 1866, + "time_per_iteration": 3.033804416656494 + }, + { + "auxiliary_loss_clip": 0.01551917, + "auxiliary_loss_mlp": 0.01374899, + "balance_loss_clip": 1.17287433, + "balance_loss_mlp": 1.04797935, + "epoch": 0.22449347682318282, + "flos": 18004677161760.0, + "grad_norm": 3.2763297529835866, + "language_loss": 0.76841885, + "learning_rate": 3.6162923186284276e-06, + "loss": 0.79768705, + "num_input_tokens_seen": 39727555, + "router_z_loss_clip": 3.78710938, + "router_z_loss_mlp": 3.26953125, + "step": 1867, + "time_per_iteration": 2.9542999267578125 + }, + { + "auxiliary_loss_clip": 0.01562289, + "auxiliary_loss_mlp": 0.01377537, + "balance_loss_clip": 1.18282938, + "balance_loss_mlp": 1.05004501, + "epoch": 0.2246137197138219, + "flos": 18699262477920.0, + "grad_norm": 2.1211851567160074, + "language_loss": 0.85881907, + "learning_rate": 3.6158333957734888e-06, + "loss": 0.88821733, + "num_input_tokens_seen": 39746145, + "router_z_loss_clip": 3.79296875, + "router_z_loss_mlp": 3.27539062, + "step": 1868, + "time_per_iteration": 3.0076093673706055 + }, + { + "auxiliary_loss_clip": 0.01558723, + "auxiliary_loss_mlp": 0.01362715, + "balance_loss_clip": 1.17691386, + "balance_loss_mlp": 1.03732109, + "epoch": 0.22473396260446102, + "flos": 15592156643520.0, + "grad_norm": 2.4332883849456914, + "language_loss": 0.82716757, + "learning_rate": 3.6153742277979088e-06, + "loss": 0.85638189, + "num_input_tokens_seen": 39763575, + "router_z_loss_clip": 3.8203125, + "router_z_loss_mlp": 3.25390625, + "step": 1869, + "time_per_iteration": 2.9707069396972656 + }, + { + "auxiliary_loss_clip": 0.0155512, + "auxiliary_loss_mlp": 0.01365414, + "balance_loss_clip": 1.17632067, + "balance_loss_mlp": 1.04192722, + "epoch": 0.2248542054951001, + "flos": 14467271980320.0, + "grad_norm": 2.1352727236656532, + "language_loss": 0.78278017, + "learning_rate": 3.6149148147713434e-06, + "loss": 0.81198549, + "num_input_tokens_seen": 39781810, + "router_z_loss_clip": 3.78320312, + "router_z_loss_mlp": 3.234375, + "step": 1870, + "time_per_iteration": 3.1595213413238525 + }, + { + "auxiliary_loss_clip": 0.01561134, + "auxiliary_loss_mlp": 0.0137381, + "balance_loss_clip": 1.1803782, + "balance_loss_mlp": 1.04803467, + "epoch": 0.22497444838573918, + "flos": 19245850656480.0, + "grad_norm": 4.019642726030632, + "language_loss": 0.86951113, + "learning_rate": 3.614455156763484e-06, + "loss": 0.89886057, + "num_input_tokens_seen": 39800115, + "router_z_loss_clip": 3.80664062, + "router_z_loss_mlp": 3.2578125, + "step": 1871, + "time_per_iteration": 2.9227521419525146 + }, + { + "auxiliary_loss_clip": 0.01556498, + "auxiliary_loss_mlp": 0.013665, + "balance_loss_clip": 1.17647612, + "balance_loss_mlp": 1.03900874, + "epoch": 0.2250946912763783, + "flos": 16912221439680.0, + "grad_norm": 2.099293190017939, + "language_loss": 0.71437371, + "learning_rate": 3.613995253844061e-06, + "loss": 0.74360371, + "num_input_tokens_seen": 39817795, + "router_z_loss_clip": 3.80273438, + "router_z_loss_mlp": 3.27539062, + "step": 1872, + "time_per_iteration": 3.0280139446258545 + }, + { + "auxiliary_loss_clip": 0.01559941, + "auxiliary_loss_mlp": 0.01360826, + "balance_loss_clip": 1.18075609, + "balance_loss_mlp": 1.03486037, + "epoch": 0.22521493416701738, + "flos": 24683703167520.0, + "grad_norm": 1.906478793670292, + "language_loss": 0.80866945, + "learning_rate": 3.6135351060828414e-06, + "loss": 0.83787715, + "num_input_tokens_seen": 39838270, + "router_z_loss_clip": 3.79492188, + "router_z_loss_mlp": 3.25976562, + "step": 1873, + "time_per_iteration": 3.032472610473633 + }, + { + "auxiliary_loss_clip": 0.0155931, + "auxiliary_loss_mlp": 0.01351962, + "balance_loss_clip": 1.1808989, + "balance_loss_mlp": 1.02485168, + "epoch": 0.22533517705765646, + "flos": 17823909729600.0, + "grad_norm": 2.375925013078044, + "language_loss": 0.691728, + "learning_rate": 3.6130747135496285e-06, + "loss": 0.72084075, + "num_input_tokens_seen": 39857270, + "router_z_loss_clip": 3.78320312, + "router_z_loss_mlp": 3.27148438, + "step": 1874, + "time_per_iteration": 3.062706470489502 + }, + { + "auxiliary_loss_clip": 0.01559836, + "auxiliary_loss_mlp": 0.01363418, + "balance_loss_clip": 1.18067789, + "balance_loss_mlp": 1.03840601, + "epoch": 0.22545541994829554, + "flos": 33694120628640.0, + "grad_norm": 2.230701254258506, + "language_loss": 0.65996408, + "learning_rate": 3.6126140763142646e-06, + "loss": 0.68919659, + "num_input_tokens_seen": 39882300, + "router_z_loss_clip": 3.79492188, + "router_z_loss_mlp": 3.25, + "step": 1875, + "time_per_iteration": 3.171071767807007 + }, + { + "auxiliary_loss_clip": 0.01560167, + "auxiliary_loss_mlp": 0.01358552, + "balance_loss_clip": 1.18226075, + "balance_loss_mlp": 1.0331583, + "epoch": 0.22557566283893465, + "flos": 19173596783040.0, + "grad_norm": 2.6875987314126353, + "language_loss": 0.86094266, + "learning_rate": 3.6121531944466275e-06, + "loss": 0.8901298, + "num_input_tokens_seen": 39899625, + "router_z_loss_clip": 3.77734375, + "router_z_loss_mlp": 3.25390625, + "step": 1876, + "time_per_iteration": 3.0566866397857666 + }, + { + "auxiliary_loss_clip": 0.01559725, + "auxiliary_loss_mlp": 0.01355254, + "balance_loss_clip": 1.18081045, + "balance_loss_mlp": 1.02928782, + "epoch": 0.22569590572957374, + "flos": 20775167297280.0, + "grad_norm": 2.7530043680436487, + "language_loss": 0.78503883, + "learning_rate": 3.611692068016633e-06, + "loss": 0.81418854, + "num_input_tokens_seen": 39915955, + "router_z_loss_clip": 3.78515625, + "router_z_loss_mlp": 3.25976562, + "step": 1877, + "time_per_iteration": 3.0123422145843506 + }, + { + "auxiliary_loss_clip": 0.01560663, + "auxiliary_loss_mlp": 0.01362792, + "balance_loss_clip": 1.18175542, + "balance_loss_mlp": 1.03758895, + "epoch": 0.22581614862021282, + "flos": 18444724045920.0, + "grad_norm": 3.998468445984982, + "language_loss": 0.7498188, + "learning_rate": 3.611230697094233e-06, + "loss": 0.77905333, + "num_input_tokens_seen": 39932655, + "router_z_loss_clip": 3.79101562, + "router_z_loss_mlp": 3.25195312, + "step": 1878, + "time_per_iteration": 3.0762507915496826 + }, + { + "auxiliary_loss_clip": 0.01557937, + "auxiliary_loss_mlp": 0.01364101, + "balance_loss_clip": 1.17951226, + "balance_loss_mlp": 1.04328537, + "epoch": 0.22593639151085193, + "flos": 20050504585920.0, + "grad_norm": 3.215944707118269, + "language_loss": 0.87584138, + "learning_rate": 3.6107690817494173e-06, + "loss": 0.90506184, + "num_input_tokens_seen": 39952875, + "router_z_loss_clip": 3.78515625, + "router_z_loss_mlp": 3.20703125, + "step": 1879, + "time_per_iteration": 3.0701870918273926 + }, + { + "auxiliary_loss_clip": 0.01552628, + "auxiliary_loss_mlp": 0.0137057, + "balance_loss_clip": 1.1757499, + "balance_loss_mlp": 1.046893, + "epoch": 0.226056634401491, + "flos": 13116788435520.0, + "grad_norm": 2.543051204827413, + "language_loss": 0.70950663, + "learning_rate": 3.6103072220522117e-06, + "loss": 0.7387386, + "num_input_tokens_seen": 39968405, + "router_z_loss_clip": 3.77148438, + "router_z_loss_mlp": 3.23632812, + "step": 1880, + "time_per_iteration": 3.302638053894043 + }, + { + "auxiliary_loss_clip": 0.01561072, + "auxiliary_loss_mlp": 0.01378584, + "balance_loss_clip": 1.1827879, + "balance_loss_mlp": 1.05376267, + "epoch": 0.2261768772921301, + "flos": 18990326092320.0, + "grad_norm": 1.8894199906187437, + "language_loss": 0.91951144, + "learning_rate": 3.609845118072682e-06, + "loss": 0.94890803, + "num_input_tokens_seen": 39987075, + "router_z_loss_clip": 3.77929688, + "router_z_loss_mlp": 3.24804688, + "step": 1881, + "time_per_iteration": 2.9766669273376465 + }, + { + "auxiliary_loss_clip": 0.0155917, + "auxiliary_loss_mlp": 0.01369469, + "balance_loss_clip": 1.18115807, + "balance_loss_mlp": 1.04350281, + "epoch": 0.2262971201827692, + "flos": 19976088807360.0, + "grad_norm": 1.8729614583660004, + "language_loss": 0.80204809, + "learning_rate": 3.6093827698809276e-06, + "loss": 0.83133441, + "num_input_tokens_seen": 40006175, + "router_z_loss_clip": 3.77929688, + "router_z_loss_mlp": 3.25976562, + "step": 1882, + "time_per_iteration": 3.027226686477661 + }, + { + "auxiliary_loss_clip": 0.01557783, + "auxiliary_loss_mlp": 0.01358294, + "balance_loss_clip": 1.180902, + "balance_loss_mlp": 1.03404427, + "epoch": 0.2264173630734083, + "flos": 16656431378400.0, + "grad_norm": 2.4231165243727197, + "language_loss": 0.85017657, + "learning_rate": 3.6089201775470864e-06, + "loss": 0.87933737, + "num_input_tokens_seen": 40021630, + "router_z_loss_clip": 3.7734375, + "router_z_loss_mlp": 3.2421875, + "step": 1883, + "time_per_iteration": 3.021926164627075 + }, + { + "auxiliary_loss_clip": 0.01558581, + "auxiliary_loss_mlp": 0.01367774, + "balance_loss_clip": 1.18168235, + "balance_loss_mlp": 1.0450511, + "epoch": 0.22653760596404737, + "flos": 24392146487040.0, + "grad_norm": 1.4958568026029397, + "language_loss": 0.77917278, + "learning_rate": 3.6084573411413334e-06, + "loss": 0.80843627, + "num_input_tokens_seen": 40041025, + "router_z_loss_clip": 3.77148438, + "router_z_loss_mlp": 3.2265625, + "step": 1884, + "time_per_iteration": 3.8764469623565674 + }, + { + "auxiliary_loss_clip": 0.01558647, + "auxiliary_loss_mlp": 0.01375655, + "balance_loss_clip": 1.18224776, + "balance_loss_mlp": 1.05064237, + "epoch": 0.22665784885468646, + "flos": 18334503720000.0, + "grad_norm": 2.27133345089345, + "language_loss": 0.81052756, + "learning_rate": 3.607994260733881e-06, + "loss": 0.83987051, + "num_input_tokens_seen": 40060265, + "router_z_loss_clip": 3.76171875, + "router_z_loss_mlp": 3.25, + "step": 1885, + "time_per_iteration": 3.0358307361602783 + }, + { + "auxiliary_loss_clip": 0.01558365, + "auxiliary_loss_mlp": 0.01371194, + "balance_loss_clip": 1.18112552, + "balance_loss_mlp": 1.04637289, + "epoch": 0.22677809174532557, + "flos": 24060233880000.0, + "grad_norm": 1.725727444833576, + "language_loss": 0.74718916, + "learning_rate": 3.6075309363949776e-06, + "loss": 0.77648479, + "num_input_tokens_seen": 40079435, + "router_z_loss_clip": 3.77148438, + "router_z_loss_mlp": 3.24804688, + "step": 1886, + "time_per_iteration": 3.9231667518615723 + }, + { + "auxiliary_loss_clip": 0.01558187, + "auxiliary_loss_mlp": 0.01366563, + "balance_loss_clip": 1.1797235, + "balance_loss_mlp": 1.04441142, + "epoch": 0.22689833463596465, + "flos": 20376234902880.0, + "grad_norm": 2.0393316967747883, + "language_loss": 0.81464827, + "learning_rate": 3.6070673681949094e-06, + "loss": 0.84389573, + "num_input_tokens_seen": 40097800, + "router_z_loss_clip": 3.78515625, + "router_z_loss_mlp": 3.22070312, + "step": 1887, + "time_per_iteration": 3.031370162963867 + }, + { + "auxiliary_loss_clip": 0.01562012, + "auxiliary_loss_mlp": 0.01356898, + "balance_loss_clip": 1.18308902, + "balance_loss_mlp": 1.03550959, + "epoch": 0.22701857752660373, + "flos": 30123186589440.0, + "grad_norm": 1.7329057830948507, + "language_loss": 0.81255496, + "learning_rate": 3.606603556203999e-06, + "loss": 0.84174412, + "num_input_tokens_seen": 40122745, + "router_z_loss_clip": 3.78710938, + "router_z_loss_mlp": 3.21289062, + "step": 1888, + "time_per_iteration": 3.123664617538452 + }, + { + "auxiliary_loss_clip": 0.01559974, + "auxiliary_loss_mlp": 0.01352966, + "balance_loss_clip": 1.18156171, + "balance_loss_mlp": 1.03215027, + "epoch": 0.22713882041724284, + "flos": 22494392056800.0, + "grad_norm": 1.9517861758463122, + "language_loss": 0.83572137, + "learning_rate": 3.6061395004926066e-06, + "loss": 0.86485076, + "num_input_tokens_seen": 40141680, + "router_z_loss_clip": 3.78125, + "router_z_loss_mlp": 3.20703125, + "step": 1889, + "time_per_iteration": 3.851292848587036 + }, + { + "auxiliary_loss_clip": 0.01558315, + "auxiliary_loss_mlp": 0.0135593, + "balance_loss_clip": 1.18069506, + "balance_loss_mlp": 1.03244328, + "epoch": 0.22725906330788193, + "flos": 20523625189920.0, + "grad_norm": 2.7648201094951177, + "language_loss": 0.85064542, + "learning_rate": 3.605675201131129e-06, + "loss": 0.87978786, + "num_input_tokens_seen": 40160140, + "router_z_loss_clip": 3.77539062, + "router_z_loss_mlp": 3.234375, + "step": 1890, + "time_per_iteration": 2.9664580821990967 + }, + { + "auxiliary_loss_clip": 0.01561483, + "auxiliary_loss_mlp": 0.01352243, + "balance_loss_clip": 1.18349409, + "balance_loss_mlp": 1.02856565, + "epoch": 0.227379306198521, + "flos": 18991957003200.0, + "grad_norm": 4.620575983376999, + "language_loss": 0.79698491, + "learning_rate": 3.60521065819e-06, + "loss": 0.82612216, + "num_input_tokens_seen": 40177450, + "router_z_loss_clip": 3.78320312, + "router_z_loss_mlp": 3.23632812, + "step": 1891, + "time_per_iteration": 2.9868602752685547 + }, + { + "auxiliary_loss_clip": 0.01553428, + "auxiliary_loss_mlp": 0.01344766, + "balance_loss_clip": 1.17443204, + "balance_loss_mlp": 1.02394986, + "epoch": 0.2274995490891601, + "flos": 21800489447520.0, + "grad_norm": 2.821330296780895, + "language_loss": 0.87532133, + "learning_rate": 3.60474587173969e-06, + "loss": 0.90430331, + "num_input_tokens_seen": 40195935, + "router_z_loss_clip": 3.78320312, + "router_z_loss_mlp": 3.20703125, + "step": 1892, + "time_per_iteration": 2.958244562149048 + }, + { + "auxiliary_loss_clip": 0.0156635, + "auxiliary_loss_mlp": 0.0136189, + "balance_loss_clip": 1.1890502, + "balance_loss_mlp": 1.03764117, + "epoch": 0.2276197919797992, + "flos": 19060455988800.0, + "grad_norm": 2.638864488463889, + "language_loss": 0.84116375, + "learning_rate": 3.6042808418507084e-06, + "loss": 0.87044609, + "num_input_tokens_seen": 40213620, + "router_z_loss_clip": 3.77148438, + "router_z_loss_mlp": 3.2421875, + "step": 1893, + "time_per_iteration": 3.698200225830078 + }, + { + "auxiliary_loss_clip": 0.0155797, + "auxiliary_loss_mlp": 0.01370347, + "balance_loss_clip": 1.18164122, + "balance_loss_mlp": 1.04609799, + "epoch": 0.22774003487043828, + "flos": 18808762168800.0, + "grad_norm": 1.8810285984565847, + "language_loss": 0.76787937, + "learning_rate": 3.6038155685935976e-06, + "loss": 0.79716259, + "num_input_tokens_seen": 40230190, + "router_z_loss_clip": 3.76171875, + "router_z_loss_mlp": 3.2421875, + "step": 1894, + "time_per_iteration": 2.9741055965423584 + }, + { + "auxiliary_loss_clip": 0.01557775, + "auxiliary_loss_mlp": 0.01361759, + "balance_loss_clip": 1.1791997, + "balance_loss_mlp": 1.03731883, + "epoch": 0.22786027776107737, + "flos": 23004758478240.0, + "grad_norm": 4.89279638260188, + "language_loss": 0.7065649, + "learning_rate": 3.6033500520389404e-06, + "loss": 0.73576021, + "num_input_tokens_seen": 40246860, + "router_z_loss_clip": 3.78515625, + "router_z_loss_mlp": 3.24414062, + "step": 1895, + "time_per_iteration": 2.9951229095458984 + }, + { + "auxiliary_loss_clip": 0.01696404, + "auxiliary_loss_mlp": 0.01274277, + "balance_loss_clip": 1.31667185, + "balance_loss_mlp": 1.00572205, + "epoch": 0.22798052065171648, + "flos": 66713264799840.0, + "grad_norm": 0.8022355827801518, + "language_loss": 0.64791977, + "learning_rate": 3.6028842922573553e-06, + "loss": 0.67762649, + "num_input_tokens_seen": 40311005, + "router_z_loss_clip": 3.796875, + "router_z_loss_mlp": 2.6875, + "step": 1896, + "time_per_iteration": 3.5154647827148438 + }, + { + "auxiliary_loss_clip": 0.01692686, + "auxiliary_loss_mlp": 0.01260773, + "balance_loss_clip": 1.31315231, + "balance_loss_mlp": 0.99603271, + "epoch": 0.22810076354235556, + "flos": 62087462209440.0, + "grad_norm": 0.8736031741437248, + "language_loss": 0.62877858, + "learning_rate": 3.602418289319497e-06, + "loss": 0.65831316, + "num_input_tokens_seen": 40369560, + "router_z_loss_clip": 3.796875, + "router_z_loss_mlp": 2.6484375, + "step": 1897, + "time_per_iteration": 3.423931121826172 + }, + { + "auxiliary_loss_clip": 0.01563061, + "auxiliary_loss_mlp": 0.0135916, + "balance_loss_clip": 1.18460202, + "balance_loss_mlp": 1.03529203, + "epoch": 0.22822100643299464, + "flos": 23878518243840.0, + "grad_norm": 2.032336628771338, + "language_loss": 0.73386729, + "learning_rate": 3.601952043296059e-06, + "loss": 0.76308954, + "num_input_tokens_seen": 40389555, + "router_z_loss_clip": 3.78515625, + "router_z_loss_mlp": 3.23828125, + "step": 1898, + "time_per_iteration": 3.071908950805664 + }, + { + "auxiliary_loss_clip": 0.01562526, + "auxiliary_loss_mlp": 0.01360852, + "balance_loss_clip": 1.18338704, + "balance_loss_mlp": 1.03526735, + "epoch": 0.22834124932363373, + "flos": 20993180546880.0, + "grad_norm": 1.9752437845568267, + "language_loss": 0.80800653, + "learning_rate": 3.6014855542577696e-06, + "loss": 0.83724028, + "num_input_tokens_seen": 40406765, + "router_z_loss_clip": 3.79492188, + "router_z_loss_mlp": 3.25585938, + "step": 1899, + "time_per_iteration": 2.9317567348480225 + }, + { + "auxiliary_loss_clip": 0.01563185, + "auxiliary_loss_mlp": 0.01364211, + "balance_loss_clip": 1.18579197, + "balance_loss_mlp": 1.04225039, + "epoch": 0.22846149221427284, + "flos": 24903992106720.0, + "grad_norm": 2.2627518101513675, + "language_loss": 0.84415042, + "learning_rate": 3.6010188222753943e-06, + "loss": 0.87342441, + "num_input_tokens_seen": 40427535, + "router_z_loss_clip": 3.77148438, + "router_z_loss_mlp": 3.21875, + "step": 1900, + "time_per_iteration": 3.094080924987793 + }, + { + "auxiliary_loss_clip": 0.01671195, + "auxiliary_loss_mlp": 0.01266594, + "balance_loss_clip": 1.28846049, + "balance_loss_mlp": 1.00337982, + "epoch": 0.22858173510491192, + "flos": 56138446785600.0, + "grad_norm": 1.0174197090820265, + "language_loss": 0.64163673, + "learning_rate": 3.6005518474197372e-06, + "loss": 0.67101467, + "num_input_tokens_seen": 40479580, + "router_z_loss_clip": 3.828125, + "router_z_loss_mlp": 2.6328125, + "step": 1901, + "time_per_iteration": 3.253082036972046 + }, + { + "auxiliary_loss_clip": 0.01563485, + "auxiliary_loss_mlp": 0.01373583, + "balance_loss_clip": 1.18289828, + "balance_loss_mlp": 1.04990613, + "epoch": 0.228701977995551, + "flos": 24173147105280.0, + "grad_norm": 2.184523070647183, + "language_loss": 0.78464341, + "learning_rate": 3.6000846297616373e-06, + "loss": 0.81401402, + "num_input_tokens_seen": 40497880, + "router_z_loss_clip": 3.80859375, + "router_z_loss_mlp": 3.23632812, + "step": 1902, + "time_per_iteration": 3.0604984760284424 + }, + { + "auxiliary_loss_clip": 0.01566796, + "auxiliary_loss_mlp": 0.01362754, + "balance_loss_clip": 1.1872102, + "balance_loss_mlp": 1.03793228, + "epoch": 0.22882222088619011, + "flos": 21389495898240.0, + "grad_norm": 2.334053797324172, + "language_loss": 0.72777408, + "learning_rate": 3.5996171693719717e-06, + "loss": 0.75706959, + "num_input_tokens_seen": 40513975, + "router_z_loss_clip": 3.79492188, + "router_z_loss_mlp": 3.24804688, + "step": 1903, + "time_per_iteration": 2.9790170192718506 + }, + { + "auxiliary_loss_clip": 0.01656283, + "auxiliary_loss_mlp": 0.01259392, + "balance_loss_clip": 1.27230549, + "balance_loss_mlp": 0.99999237, + "epoch": 0.2289424637768292, + "flos": 64595828280960.0, + "grad_norm": 0.8401641375127369, + "language_loss": 0.64845341, + "learning_rate": 3.5991494663216528e-06, + "loss": 0.67761016, + "num_input_tokens_seen": 40576960, + "router_z_loss_clip": 3.84375, + "router_z_loss_mlp": 2.59375, + "step": 1904, + "time_per_iteration": 3.4095451831817627 + }, + { + "auxiliary_loss_clip": 0.01561945, + "auxiliary_loss_mlp": 0.01360994, + "balance_loss_clip": 1.18231905, + "balance_loss_mlp": 1.03846133, + "epoch": 0.22906270666746828, + "flos": 22165589558880.0, + "grad_norm": 5.339619644454513, + "language_loss": 0.8748349, + "learning_rate": 3.5986815206816314e-06, + "loss": 0.9040643, + "num_input_tokens_seen": 40595780, + "router_z_loss_clip": 3.79882812, + "router_z_loss_mlp": 3.22460938, + "step": 1905, + "time_per_iteration": 3.0101583003997803 + }, + { + "auxiliary_loss_clip": 0.01551257, + "auxiliary_loss_mlp": 0.01365236, + "balance_loss_clip": 1.17102146, + "balance_loss_mlp": 1.0423224, + "epoch": 0.2291829495581074, + "flos": 25774527978720.0, + "grad_norm": 3.7574346873305773, + "language_loss": 0.74661684, + "learning_rate": 3.598213332522895e-06, + "loss": 0.77578175, + "num_input_tokens_seen": 40615810, + "router_z_loss_clip": 3.80273438, + "router_z_loss_mlp": 3.22851562, + "step": 1906, + "time_per_iteration": 3.0112950801849365 + }, + { + "auxiliary_loss_clip": 0.01560777, + "auxiliary_loss_mlp": 0.01365658, + "balance_loss_clip": 1.17979789, + "balance_loss_mlp": 1.04331589, + "epoch": 0.22930319244874647, + "flos": 31175286384960.0, + "grad_norm": 1.8772375270945563, + "language_loss": 0.77351868, + "learning_rate": 3.597744901916466e-06, + "loss": 0.80278301, + "num_input_tokens_seen": 40637095, + "router_z_loss_clip": 3.8125, + "router_z_loss_mlp": 3.22265625, + "step": 1907, + "time_per_iteration": 3.0749852657318115 + }, + { + "auxiliary_loss_clip": 0.0155004, + "auxiliary_loss_mlp": 0.01363031, + "balance_loss_clip": 1.17005908, + "balance_loss_mlp": 1.04126167, + "epoch": 0.22942343533938556, + "flos": 23256110944800.0, + "grad_norm": 1.9476646575691996, + "language_loss": 0.76671481, + "learning_rate": 3.5972762289334058e-06, + "loss": 0.79584551, + "num_input_tokens_seen": 40656725, + "router_z_loss_clip": 3.80078125, + "router_z_loss_mlp": 3.21679688, + "step": 1908, + "time_per_iteration": 3.1474862098693848 + }, + { + "auxiliary_loss_clip": 0.01556651, + "auxiliary_loss_mlp": 0.01365451, + "balance_loss_clip": 1.17580497, + "balance_loss_mlp": 1.04291809, + "epoch": 0.22954367823002464, + "flos": 14612652074880.0, + "grad_norm": 4.280286525404953, + "language_loss": 0.84966213, + "learning_rate": 3.5968073136448116e-06, + "loss": 0.87888312, + "num_input_tokens_seen": 40674745, + "router_z_loss_clip": 3.80664062, + "router_z_loss_mlp": 3.22460938, + "step": 1909, + "time_per_iteration": 3.0138790607452393 + }, + { + "auxiliary_loss_clip": 0.0155715, + "auxiliary_loss_mlp": 0.01365207, + "balance_loss_clip": 1.17593789, + "balance_loss_mlp": 1.04172099, + "epoch": 0.22966392112066375, + "flos": 16765475931360.0, + "grad_norm": 1.8893771870363598, + "language_loss": 0.91495782, + "learning_rate": 3.596338156121818e-06, + "loss": 0.94418132, + "num_input_tokens_seen": 40693630, + "router_z_loss_clip": 3.80859375, + "router_z_loss_mlp": 3.234375, + "step": 1910, + "time_per_iteration": 3.1009604930877686 + }, + { + "auxiliary_loss_clip": 0.01630104, + "auxiliary_loss_mlp": 0.01253883, + "balance_loss_clip": 1.24344945, + "balance_loss_mlp": 0.99600983, + "epoch": 0.22978416401130283, + "flos": 67480748768160.0, + "grad_norm": 0.7541858267247945, + "language_loss": 0.59299809, + "learning_rate": 3.595868756435595e-06, + "loss": 0.62183797, + "num_input_tokens_seen": 40761310, + "router_z_loss_clip": 3.875, + "router_z_loss_mlp": 2.578125, + "step": 1911, + "time_per_iteration": 3.590914487838745 + }, + { + "auxiliary_loss_clip": 0.01558077, + "auxiliary_loss_mlp": 0.01369625, + "balance_loss_clip": 1.17802012, + "balance_loss_mlp": 1.04804635, + "epoch": 0.22990440690194192, + "flos": 19867195967040.0, + "grad_norm": 2.948806177205054, + "language_loss": 0.80286211, + "learning_rate": 3.5953991146573504e-06, + "loss": 0.83213913, + "num_input_tokens_seen": 40779955, + "router_z_loss_clip": 3.80078125, + "router_z_loss_mlp": 3.21484375, + "step": 1912, + "time_per_iteration": 3.9234111309051514 + }, + { + "auxiliary_loss_clip": 0.01553785, + "auxiliary_loss_mlp": 0.01351205, + "balance_loss_clip": 1.17380798, + "balance_loss_mlp": 1.02638364, + "epoch": 0.23002464979258103, + "flos": 13290463301760.0, + "grad_norm": 4.013702061447833, + "language_loss": 0.83905655, + "learning_rate": 3.5949292308583294e-06, + "loss": 0.86810648, + "num_input_tokens_seen": 40793200, + "router_z_loss_clip": 3.80078125, + "router_z_loss_mlp": 3.24804688, + "step": 1913, + "time_per_iteration": 3.829319715499878 + }, + { + "auxiliary_loss_clip": 0.0155613, + "auxiliary_loss_mlp": 0.01358431, + "balance_loss_clip": 1.1752851, + "balance_loss_mlp": 1.04009485, + "epoch": 0.2301448926832201, + "flos": 22165968840480.0, + "grad_norm": 2.751143374695222, + "language_loss": 0.81067932, + "learning_rate": 3.594459105109811e-06, + "loss": 0.83982491, + "num_input_tokens_seen": 40812380, + "router_z_loss_clip": 3.80859375, + "router_z_loss_mlp": 3.18164062, + "step": 1914, + "time_per_iteration": 3.05061674118042 + }, + { + "auxiliary_loss_clip": 0.01559431, + "auxiliary_loss_mlp": 0.01353207, + "balance_loss_clip": 1.17753696, + "balance_loss_mlp": 1.03181887, + "epoch": 0.2302651355738592, + "flos": 20706592455360.0, + "grad_norm": 1.8877883325159954, + "language_loss": 0.81578916, + "learning_rate": 3.593988737483115e-06, + "loss": 0.84491551, + "num_input_tokens_seen": 40832320, + "router_z_loss_clip": 3.81640625, + "router_z_loss_mlp": 3.21289062, + "step": 1915, + "time_per_iteration": 3.0141236782073975 + }, + { + "auxiliary_loss_clip": 0.01564703, + "auxiliary_loss_mlp": 0.01357713, + "balance_loss_clip": 1.18340945, + "balance_loss_mlp": 1.0349896, + "epoch": 0.23038537846449827, + "flos": 18590976488160.0, + "grad_norm": 2.05938331160741, + "language_loss": 0.78415388, + "learning_rate": 3.5935181280495947e-06, + "loss": 0.81337804, + "num_input_tokens_seen": 40850900, + "router_z_loss_clip": 3.81054688, + "router_z_loss_mlp": 3.2265625, + "step": 1916, + "time_per_iteration": 3.0496585369110107 + }, + { + "auxiliary_loss_clip": 0.01623382, + "auxiliary_loss_mlp": 0.01258728, + "balance_loss_clip": 1.23623967, + "balance_loss_mlp": 0.99932861, + "epoch": 0.23050562135513739, + "flos": 64230424744320.0, + "grad_norm": 0.8065526080700548, + "language_loss": 0.5418756, + "learning_rate": 3.5930472768806412e-06, + "loss": 0.57069671, + "num_input_tokens_seen": 40909570, + "router_z_loss_clip": 3.875, + "router_z_loss_mlp": 2.59375, + "step": 1917, + "time_per_iteration": 4.318268537521362 + }, + { + "auxiliary_loss_clip": 0.01563645, + "auxiliary_loss_mlp": 0.01362151, + "balance_loss_clip": 1.18183255, + "balance_loss_mlp": 1.04019046, + "epoch": 0.23062586424577647, + "flos": 17315439716160.0, + "grad_norm": 2.246057547543801, + "language_loss": 0.76805508, + "learning_rate": 3.5925761840476826e-06, + "loss": 0.79731309, + "num_input_tokens_seen": 40928180, + "router_z_loss_clip": 3.81445312, + "router_z_loss_mlp": 3.21875, + "step": 1918, + "time_per_iteration": 2.9802286624908447 + }, + { + "auxiliary_loss_clip": 0.01555465, + "auxiliary_loss_mlp": 0.01349759, + "balance_loss_clip": 1.17479742, + "balance_loss_mlp": 1.02817988, + "epoch": 0.23074610713641555, + "flos": 27858511496160.0, + "grad_norm": 2.7182825351905757, + "language_loss": 0.82012665, + "learning_rate": 3.592104849622183e-06, + "loss": 0.84917891, + "num_input_tokens_seen": 40950435, + "router_z_loss_clip": 3.80664062, + "router_z_loss_mlp": 3.21484375, + "step": 1919, + "time_per_iteration": 3.0360238552093506 + }, + { + "auxiliary_loss_clip": 0.01552818, + "auxiliary_loss_mlp": 0.0135667, + "balance_loss_clip": 1.16962945, + "balance_loss_mlp": 1.03375602, + "epoch": 0.23086635002705466, + "flos": 28844425923840.0, + "grad_norm": 1.5798129771117513, + "language_loss": 0.73552889, + "learning_rate": 3.591633273675644e-06, + "loss": 0.76462376, + "num_input_tokens_seen": 40972670, + "router_z_loss_clip": 3.83398438, + "router_z_loss_mlp": 3.22851562, + "step": 1920, + "time_per_iteration": 3.1535871028900146 + }, + { + "auxiliary_loss_clip": 0.01611076, + "auxiliary_loss_mlp": 0.01249802, + "balance_loss_clip": 1.22347474, + "balance_loss_mlp": 0.99421692, + "epoch": 0.23098659291769374, + "flos": 62928452044800.0, + "grad_norm": 0.9151321327485906, + "language_loss": 0.58150029, + "learning_rate": 3.591161456279602e-06, + "loss": 0.61010909, + "num_input_tokens_seen": 41018215, + "router_z_loss_clip": 3.875, + "router_z_loss_mlp": 2.5546875, + "step": 1921, + "time_per_iteration": 4.041490793228149 + }, + { + "auxiliary_loss_clip": 0.01552803, + "auxiliary_loss_mlp": 0.01360134, + "balance_loss_clip": 1.17216706, + "balance_loss_mlp": 1.03760111, + "epoch": 0.23110683580833283, + "flos": 23479054855200.0, + "grad_norm": 1.6617275309113362, + "language_loss": 0.80618602, + "learning_rate": 3.590689397505633e-06, + "loss": 0.83531541, + "num_input_tokens_seen": 41039125, + "router_z_loss_clip": 3.80664062, + "router_z_loss_mlp": 3.22460938, + "step": 1922, + "time_per_iteration": 3.055166006088257 + }, + { + "auxiliary_loss_clip": 0.01553975, + "auxiliary_loss_mlp": 0.01352552, + "balance_loss_clip": 1.17175472, + "balance_loss_mlp": 1.03173637, + "epoch": 0.2312270786989719, + "flos": 27273501727200.0, + "grad_norm": 1.7083955801109982, + "language_loss": 0.86774302, + "learning_rate": 3.590217097425347e-06, + "loss": 0.89680827, + "num_input_tokens_seen": 41059025, + "router_z_loss_clip": 3.82421875, + "router_z_loss_mlp": 3.20703125, + "step": 1923, + "time_per_iteration": 3.0254132747650146 + }, + { + "auxiliary_loss_clip": 0.01558156, + "auxiliary_loss_mlp": 0.01359592, + "balance_loss_clip": 1.17585981, + "balance_loss_mlp": 1.03839421, + "epoch": 0.23134732158961102, + "flos": 13263040877760.0, + "grad_norm": 2.7583057785218807, + "language_loss": 0.7131896, + "learning_rate": 3.589744556110391e-06, + "loss": 0.74236703, + "num_input_tokens_seen": 41077015, + "router_z_loss_clip": 3.82421875, + "router_z_loss_mlp": 3.2109375, + "step": 1924, + "time_per_iteration": 2.88999605178833 + }, + { + "auxiliary_loss_clip": 0.01553038, + "auxiliary_loss_mlp": 0.01350117, + "balance_loss_clip": 1.17137432, + "balance_loss_mlp": 1.02586794, + "epoch": 0.2314675644802501, + "flos": 36980704337760.0, + "grad_norm": 1.7354532211857048, + "language_loss": 0.84729338, + "learning_rate": 3.58927177363245e-06, + "loss": 0.87632501, + "num_input_tokens_seen": 41099840, + "router_z_loss_clip": 3.81445312, + "router_z_loss_mlp": 3.2421875, + "step": 1925, + "time_per_iteration": 3.0471177101135254 + }, + { + "auxiliary_loss_clip": 0.01554447, + "auxiliary_loss_mlp": 0.01351482, + "balance_loss_clip": 1.17230964, + "balance_loss_mlp": 1.02837682, + "epoch": 0.2315878073708892, + "flos": 23844496320000.0, + "grad_norm": 2.918482095496963, + "language_loss": 0.72856683, + "learning_rate": 3.5887987500632447e-06, + "loss": 0.75762612, + "num_input_tokens_seen": 41117845, + "router_z_loss_clip": 3.82226562, + "router_z_loss_mlp": 3.23046875, + "step": 1926, + "time_per_iteration": 3.0417943000793457 + }, + { + "auxiliary_loss_clip": 0.01561609, + "auxiliary_loss_mlp": 0.01362508, + "balance_loss_clip": 1.17863107, + "balance_loss_mlp": 1.04436278, + "epoch": 0.2317080502615283, + "flos": 23041662942240.0, + "grad_norm": 1.7976779061267576, + "language_loss": 0.84411341, + "learning_rate": 3.5883254854745325e-06, + "loss": 0.87335455, + "num_input_tokens_seen": 41136235, + "router_z_loss_clip": 3.828125, + "router_z_loss_mlp": 3.1796875, + "step": 1927, + "time_per_iteration": 2.9194624423980713 + }, + { + "auxiliary_loss_clip": 0.015473, + "auxiliary_loss_mlp": 0.01353488, + "balance_loss_clip": 1.16597319, + "balance_loss_mlp": 1.03362608, + "epoch": 0.23182829315216738, + "flos": 11256241894560.0, + "grad_norm": 2.6000369719712593, + "language_loss": 0.7530427, + "learning_rate": 3.587851979938107e-06, + "loss": 0.78205061, + "num_input_tokens_seen": 41153125, + "router_z_loss_clip": 3.81445312, + "router_z_loss_mlp": 3.19726562, + "step": 1928, + "time_per_iteration": 2.982555627822876 + }, + { + "auxiliary_loss_clip": 0.01546775, + "auxiliary_loss_mlp": 0.01360132, + "balance_loss_clip": 1.16560876, + "balance_loss_mlp": 1.03740835, + "epoch": 0.23194853604280646, + "flos": 19831012138080.0, + "grad_norm": 2.3303647547308435, + "language_loss": 0.77351999, + "learning_rate": 3.5873782335257985e-06, + "loss": 0.80258906, + "num_input_tokens_seen": 41171290, + "router_z_loss_clip": 3.8125, + "router_z_loss_mlp": 3.2265625, + "step": 1929, + "time_per_iteration": 2.9672839641571045 + }, + { + "auxiliary_loss_clip": 0.01549743, + "auxiliary_loss_mlp": 0.01364209, + "balance_loss_clip": 1.16844463, + "balance_loss_mlp": 1.03748059, + "epoch": 0.23206877893344555, + "flos": 15306858109440.0, + "grad_norm": 2.3369977259917842, + "language_loss": 0.78525114, + "learning_rate": 3.5869042463094744e-06, + "loss": 0.81439066, + "num_input_tokens_seen": 41189005, + "router_z_loss_clip": 3.8125, + "router_z_loss_mlp": 3.26757812, + "step": 1930, + "time_per_iteration": 3.0607352256774902 + }, + { + "auxiliary_loss_clip": 0.01551389, + "auxiliary_loss_mlp": 0.01379528, + "balance_loss_clip": 1.17104816, + "balance_loss_mlp": 1.05642343, + "epoch": 0.23218902182408466, + "flos": 22713732792000.0, + "grad_norm": 2.230476335318929, + "language_loss": 0.76834065, + "learning_rate": 3.586430018361038e-06, + "loss": 0.79764986, + "num_input_tokens_seen": 41208775, + "router_z_loss_clip": 3.8046875, + "router_z_loss_mlp": 3.23046875, + "step": 1931, + "time_per_iteration": 3.026787519454956 + }, + { + "auxiliary_loss_clip": 0.01550239, + "auxiliary_loss_mlp": 0.01358135, + "balance_loss_clip": 1.16806114, + "balance_loss_mlp": 1.0357933, + "epoch": 0.23230926471472374, + "flos": 22712936300640.0, + "grad_norm": 2.9815102933763913, + "language_loss": 0.76631922, + "learning_rate": 3.5859555497524283e-06, + "loss": 0.795403, + "num_input_tokens_seen": 41226010, + "router_z_loss_clip": 3.82421875, + "router_z_loss_mlp": 3.22265625, + "step": 1932, + "time_per_iteration": 2.9955062866210938 + }, + { + "auxiliary_loss_clip": 0.01558746, + "auxiliary_loss_mlp": 0.01366124, + "balance_loss_clip": 1.17872632, + "balance_loss_mlp": 1.04588056, + "epoch": 0.23242950760536282, + "flos": 20377562388480.0, + "grad_norm": 2.2835702518413172, + "language_loss": 0.92353034, + "learning_rate": 3.5854808405556237e-06, + "loss": 0.95277905, + "num_input_tokens_seen": 41245245, + "router_z_loss_clip": 3.79882812, + "router_z_loss_mlp": 3.20117188, + "step": 1933, + "time_per_iteration": 3.1019792556762695 + }, + { + "auxiliary_loss_clip": 0.01548371, + "auxiliary_loss_mlp": 0.01353368, + "balance_loss_clip": 1.16834617, + "balance_loss_mlp": 1.03445935, + "epoch": 0.23254975049600193, + "flos": 16910135390880.0, + "grad_norm": 3.541724869551357, + "language_loss": 0.75712037, + "learning_rate": 3.5850058908426355e-06, + "loss": 0.7861377, + "num_input_tokens_seen": 41263795, + "router_z_loss_clip": 3.80078125, + "router_z_loss_mlp": 3.1875, + "step": 1934, + "time_per_iteration": 3.0391430854797363 + }, + { + "auxiliary_loss_clip": 0.01549205, + "auxiliary_loss_mlp": 0.01346859, + "balance_loss_clip": 1.16883922, + "balance_loss_mlp": 1.02527964, + "epoch": 0.23266999338664102, + "flos": 23297073721920.0, + "grad_norm": 3.717789173834168, + "language_loss": 0.85553694, + "learning_rate": 3.584530700685514e-06, + "loss": 0.88449758, + "num_input_tokens_seen": 41284055, + "router_z_loss_clip": 3.8046875, + "router_z_loss_mlp": 3.21484375, + "step": 1935, + "time_per_iteration": 3.0223755836486816 + }, + { + "auxiliary_loss_clip": 0.01559728, + "auxiliary_loss_mlp": 0.0134727, + "balance_loss_clip": 1.18021917, + "balance_loss_mlp": 1.02797961, + "epoch": 0.2327902362772801, + "flos": 19571846470560.0, + "grad_norm": 1.9308027950948217, + "language_loss": 0.89266455, + "learning_rate": 3.5840552701563448e-06, + "loss": 0.92173457, + "num_input_tokens_seen": 41300255, + "router_z_loss_clip": 3.79296875, + "router_z_loss_mlp": 3.19140625, + "step": 1936, + "time_per_iteration": 3.057248592376709 + }, + { + "auxiliary_loss_clip": 0.0155748, + "auxiliary_loss_mlp": 0.01363014, + "balance_loss_clip": 1.17566252, + "balance_loss_mlp": 1.03838313, + "epoch": 0.2329104791679192, + "flos": 16729405886880.0, + "grad_norm": 2.354556283578648, + "language_loss": 0.81593585, + "learning_rate": 3.5835795993272513e-06, + "loss": 0.84514076, + "num_input_tokens_seen": 41318540, + "router_z_loss_clip": 3.81640625, + "router_z_loss_mlp": 3.24609375, + "step": 1937, + "time_per_iteration": 2.993255376815796 + }, + { + "auxiliary_loss_clip": 0.01554374, + "auxiliary_loss_mlp": 0.01366751, + "balance_loss_clip": 1.17173004, + "balance_loss_mlp": 1.04173875, + "epoch": 0.2330307220585583, + "flos": 22165930912320.0, + "grad_norm": 2.8251114637235615, + "language_loss": 0.71446741, + "learning_rate": 3.583103688270391e-06, + "loss": 0.74367863, + "num_input_tokens_seen": 41338320, + "router_z_loss_clip": 3.82617188, + "router_z_loss_mlp": 3.25, + "step": 1938, + "time_per_iteration": 3.0344290733337402 + }, + { + "auxiliary_loss_clip": 0.01560944, + "auxiliary_loss_mlp": 0.01350532, + "balance_loss_clip": 1.17935276, + "balance_loss_mlp": 1.03028834, + "epoch": 0.23315096494919738, + "flos": 19319318231040.0, + "grad_norm": 2.3218266501096636, + "language_loss": 0.89490712, + "learning_rate": 3.58262753705796e-06, + "loss": 0.92402196, + "num_input_tokens_seen": 41353210, + "router_z_loss_clip": 3.81445312, + "router_z_loss_mlp": 3.20117188, + "step": 1939, + "time_per_iteration": 2.98173451423645 + }, + { + "auxiliary_loss_clip": 0.01591965, + "auxiliary_loss_mlp": 0.0129657, + "balance_loss_clip": 1.20321202, + "balance_loss_mlp": 1.03030396, + "epoch": 0.23327120783983646, + "flos": 53037030175200.0, + "grad_norm": 0.781387835345841, + "language_loss": 0.55489451, + "learning_rate": 3.5821511457621902e-06, + "loss": 0.58377987, + "num_input_tokens_seen": 41410510, + "router_z_loss_clip": 3.8828125, + "router_z_loss_mlp": 2.6640625, + "step": 1940, + "time_per_iteration": 4.273868560791016 + }, + { + "auxiliary_loss_clip": 0.01555307, + "auxiliary_loss_mlp": 0.01358044, + "balance_loss_clip": 1.17399621, + "balance_loss_mlp": 1.03474808, + "epoch": 0.23339145073047557, + "flos": 17128793419200.0, + "grad_norm": 3.055213033783543, + "language_loss": 0.81165719, + "learning_rate": 3.5816745144553497e-06, + "loss": 0.84079075, + "num_input_tokens_seen": 41425830, + "router_z_loss_clip": 3.81640625, + "router_z_loss_mlp": 3.23242188, + "step": 1941, + "time_per_iteration": 3.953540802001953 + }, + { + "auxiliary_loss_clip": 0.01554944, + "auxiliary_loss_mlp": 0.01360044, + "balance_loss_clip": 1.17423058, + "balance_loss_mlp": 1.03922796, + "epoch": 0.23351169362111465, + "flos": 13080604606560.0, + "grad_norm": 1.9462913078836792, + "language_loss": 0.75768948, + "learning_rate": 3.5811976432097424e-06, + "loss": 0.78683937, + "num_input_tokens_seen": 41443500, + "router_z_loss_clip": 3.80859375, + "router_z_loss_mlp": 3.20703125, + "step": 1942, + "time_per_iteration": 3.016392469406128 + }, + { + "auxiliary_loss_clip": 0.01556153, + "auxiliary_loss_mlp": 0.01361459, + "balance_loss_clip": 1.17590928, + "balance_loss_mlp": 1.04197848, + "epoch": 0.23363193651175373, + "flos": 15853067006400.0, + "grad_norm": 2.0135853119607234, + "language_loss": 0.84526098, + "learning_rate": 3.58072053209771e-06, + "loss": 0.87443703, + "num_input_tokens_seen": 41460055, + "router_z_loss_clip": 3.80078125, + "router_z_loss_mlp": 3.19335938, + "step": 1943, + "time_per_iteration": 2.9928863048553467 + }, + { + "auxiliary_loss_clip": 0.01552376, + "auxiliary_loss_mlp": 0.01357703, + "balance_loss_clip": 1.17153168, + "balance_loss_mlp": 1.03765035, + "epoch": 0.23375217940239285, + "flos": 21027505896000.0, + "grad_norm": 4.539308395173403, + "language_loss": 0.79121268, + "learning_rate": 3.5802431811916296e-06, + "loss": 0.82031345, + "num_input_tokens_seen": 41476665, + "router_z_loss_clip": 3.80273438, + "router_z_loss_mlp": 3.19921875, + "step": 1944, + "time_per_iteration": 3.884370803833008 + }, + { + "auxiliary_loss_clip": 0.01553026, + "auxiliary_loss_mlp": 0.01370209, + "balance_loss_clip": 1.173033, + "balance_loss_mlp": 1.04767656, + "epoch": 0.23387242229303193, + "flos": 20596789339200.0, + "grad_norm": 1.8888647764988329, + "language_loss": 0.80683374, + "learning_rate": 3.579765590563916e-06, + "loss": 0.83606607, + "num_input_tokens_seen": 41496065, + "router_z_loss_clip": 3.79882812, + "router_z_loss_mlp": 3.22460938, + "step": 1945, + "time_per_iteration": 3.0600626468658447 + }, + { + "auxiliary_loss_clip": 0.01559399, + "auxiliary_loss_mlp": 0.01368573, + "balance_loss_clip": 1.17791915, + "balance_loss_mlp": 1.04909182, + "epoch": 0.233992665183671, + "flos": 24281774448480.0, + "grad_norm": 2.243174738971978, + "language_loss": 0.82071126, + "learning_rate": 3.579287760287017e-06, + "loss": 0.84999096, + "num_input_tokens_seen": 41516815, + "router_z_loss_clip": 3.81640625, + "router_z_loss_mlp": 3.19335938, + "step": 1946, + "time_per_iteration": 3.0289885997772217 + }, + { + "auxiliary_loss_clip": 0.01563323, + "auxiliary_loss_mlp": 0.01359803, + "balance_loss_clip": 1.18280208, + "balance_loss_mlp": 1.04280138, + "epoch": 0.2341129080743101, + "flos": 30157284369600.0, + "grad_norm": 1.9854074511203377, + "language_loss": 0.73196787, + "learning_rate": 3.578809690433421e-06, + "loss": 0.76119912, + "num_input_tokens_seen": 41538525, + "router_z_loss_clip": 3.8046875, + "router_z_loss_mlp": 3.16796875, + "step": 1947, + "time_per_iteration": 3.042797088623047 + }, + { + "auxiliary_loss_clip": 0.01561712, + "auxiliary_loss_mlp": 0.01364128, + "balance_loss_clip": 1.18141675, + "balance_loss_mlp": 1.0457921, + "epoch": 0.2342331509649492, + "flos": 22786365947040.0, + "grad_norm": 2.613181690612259, + "language_loss": 0.81748033, + "learning_rate": 3.578331381075651e-06, + "loss": 0.8467387, + "num_input_tokens_seen": 41559025, + "router_z_loss_clip": 3.80078125, + "router_z_loss_mlp": 3.18164062, + "step": 1948, + "time_per_iteration": 3.870478868484497 + }, + { + "auxiliary_loss_clip": 0.01561242, + "auxiliary_loss_mlp": 0.0135845, + "balance_loss_clip": 1.18261623, + "balance_loss_mlp": 1.03706217, + "epoch": 0.2343533938555883, + "flos": 23625421081920.0, + "grad_norm": 2.7226216462093538, + "language_loss": 0.69348067, + "learning_rate": 3.5778528322862646e-06, + "loss": 0.72267759, + "num_input_tokens_seen": 41577845, + "router_z_loss_clip": 3.78515625, + "router_z_loss_mlp": 3.21289062, + "step": 1949, + "time_per_iteration": 2.9949100017547607 + }, + { + "auxiliary_loss_clip": 0.01564651, + "auxiliary_loss_mlp": 0.01359821, + "balance_loss_clip": 1.18587995, + "balance_loss_mlp": 1.03995824, + "epoch": 0.23447363674622737, + "flos": 24572572565760.0, + "grad_norm": 1.7100554230481737, + "language_loss": 0.86579275, + "learning_rate": 3.5773740441378585e-06, + "loss": 0.89503741, + "num_input_tokens_seen": 41598600, + "router_z_loss_clip": 3.78515625, + "router_z_loss_mlp": 3.19726562, + "step": 1950, + "time_per_iteration": 3.0098178386688232 + }, + { + "auxiliary_loss_clip": 0.01563952, + "auxiliary_loss_mlp": 0.01355334, + "balance_loss_clip": 1.1840173, + "balance_loss_mlp": 1.03432703, + "epoch": 0.23459387963686648, + "flos": 53144254540800.0, + "grad_norm": 1.6183350959904712, + "language_loss": 0.73773098, + "learning_rate": 3.5768950167030633e-06, + "loss": 0.76692379, + "num_input_tokens_seen": 41623300, + "router_z_loss_clip": 3.79882812, + "router_z_loss_mlp": 3.20898438, + "step": 1951, + "time_per_iteration": 3.237889051437378 + }, + { + "auxiliary_loss_clip": 0.01572052, + "auxiliary_loss_mlp": 0.01365851, + "balance_loss_clip": 1.19230962, + "balance_loss_mlp": 1.04331851, + "epoch": 0.23471412252750556, + "flos": 23953692585600.0, + "grad_norm": 2.5347160454402307, + "language_loss": 0.78660786, + "learning_rate": 3.576415750054548e-06, + "loss": 0.81598687, + "num_input_tokens_seen": 41643420, + "router_z_loss_clip": 3.796875, + "router_z_loss_mlp": 3.22460938, + "step": 1952, + "time_per_iteration": 3.080298900604248 + }, + { + "auxiliary_loss_clip": 0.01559761, + "auxiliary_loss_mlp": 0.01355843, + "balance_loss_clip": 1.18052197, + "balance_loss_mlp": 1.03636217, + "epoch": 0.23483436541814465, + "flos": 15708445475040.0, + "grad_norm": 2.1758470670966403, + "language_loss": 0.85825253, + "learning_rate": 3.5759362442650172e-06, + "loss": 0.88740861, + "num_input_tokens_seen": 41660170, + "router_z_loss_clip": 3.79296875, + "router_z_loss_mlp": 3.19335938, + "step": 1953, + "time_per_iteration": 2.993536949157715 + }, + { + "auxiliary_loss_clip": 0.01566442, + "auxiliary_loss_mlp": 0.01359956, + "balance_loss_clip": 1.18809652, + "balance_loss_mlp": 1.04238248, + "epoch": 0.23495460830878373, + "flos": 24938582952960.0, + "grad_norm": 2.9902477592170422, + "language_loss": 0.85396028, + "learning_rate": 3.5754564994072113e-06, + "loss": 0.88322413, + "num_input_tokens_seen": 41679010, + "router_z_loss_clip": 3.78320312, + "router_z_loss_mlp": 3.17382812, + "step": 1954, + "time_per_iteration": 2.9972965717315674 + }, + { + "auxiliary_loss_clip": 0.01569692, + "auxiliary_loss_mlp": 0.01361985, + "balance_loss_clip": 1.19161832, + "balance_loss_mlp": 1.03811765, + "epoch": 0.23507485119942284, + "flos": 30485328304320.0, + "grad_norm": 2.2813942134051635, + "language_loss": 0.59957129, + "learning_rate": 3.5749765155539067e-06, + "loss": 0.62888807, + "num_input_tokens_seen": 41699495, + "router_z_loss_clip": 3.77929688, + "router_z_loss_mlp": 3.23828125, + "step": 1955, + "time_per_iteration": 3.136340618133545 + }, + { + "auxiliary_loss_clip": 0.01567736, + "auxiliary_loss_mlp": 0.01358175, + "balance_loss_clip": 1.18877614, + "balance_loss_mlp": 1.03259051, + "epoch": 0.23519509409006192, + "flos": 18330900544800.0, + "grad_norm": 2.3728038476029214, + "language_loss": 0.92775512, + "learning_rate": 3.574496292777917e-06, + "loss": 0.9570142, + "num_input_tokens_seen": 41717705, + "router_z_loss_clip": 3.79101562, + "router_z_loss_mlp": 3.25585938, + "step": 1956, + "time_per_iteration": 3.071545124053955 + }, + { + "auxiliary_loss_clip": 0.01559341, + "auxiliary_loss_mlp": 0.01355587, + "balance_loss_clip": 1.18095994, + "balance_loss_mlp": 1.0298121, + "epoch": 0.235315336980701, + "flos": 29645818031520.0, + "grad_norm": 2.1282501439669845, + "language_loss": 0.71806777, + "learning_rate": 3.574015831152092e-06, + "loss": 0.74721706, + "num_input_tokens_seen": 41738120, + "router_z_loss_clip": 3.77929688, + "router_z_loss_mlp": 3.2578125, + "step": 1957, + "time_per_iteration": 3.002741575241089 + }, + { + "auxiliary_loss_clip": 0.01561658, + "auxiliary_loss_mlp": 0.01363198, + "balance_loss_clip": 1.18091893, + "balance_loss_mlp": 1.04276395, + "epoch": 0.23543557987134012, + "flos": 18553692742560.0, + "grad_norm": 3.0739968797132637, + "language_loss": 0.83865988, + "learning_rate": 3.573535130749316e-06, + "loss": 0.86790848, + "num_input_tokens_seen": 41756070, + "router_z_loss_clip": 3.80664062, + "router_z_loss_mlp": 3.203125, + "step": 1958, + "time_per_iteration": 3.0714356899261475 + }, + { + "auxiliary_loss_clip": 0.01568292, + "auxiliary_loss_mlp": 0.01352926, + "balance_loss_clip": 1.19000638, + "balance_loss_mlp": 1.03363574, + "epoch": 0.2355558227619792, + "flos": 24681237837120.0, + "grad_norm": 1.6909421834866094, + "language_loss": 0.73929191, + "learning_rate": 3.5730541916425127e-06, + "loss": 0.76850414, + "num_input_tokens_seen": 41777550, + "router_z_loss_clip": 3.77929688, + "router_z_loss_mlp": 3.19140625, + "step": 1959, + "time_per_iteration": 2.9830973148345947 + }, + { + "auxiliary_loss_clip": 0.01563689, + "auxiliary_loss_mlp": 0.01349276, + "balance_loss_clip": 1.18490255, + "balance_loss_mlp": 1.0257901, + "epoch": 0.23567606565261828, + "flos": 21946969458720.0, + "grad_norm": 3.969425765812048, + "language_loss": 0.86423182, + "learning_rate": 3.572573013904639e-06, + "loss": 0.89336151, + "num_input_tokens_seen": 41797460, + "router_z_loss_clip": 3.78710938, + "router_z_loss_mlp": 3.234375, + "step": 1960, + "time_per_iteration": 3.037593126296997 + }, + { + "auxiliary_loss_clip": 0.01557679, + "auxiliary_loss_mlp": 0.01370379, + "balance_loss_clip": 1.17850137, + "balance_loss_mlp": 1.0501349, + "epoch": 0.2357963085432574, + "flos": 13591046884320.0, + "grad_norm": 1.7920660913874105, + "language_loss": 0.91989613, + "learning_rate": 3.572091597608689e-06, + "loss": 0.94917667, + "num_input_tokens_seen": 41815585, + "router_z_loss_clip": 3.7890625, + "router_z_loss_mlp": 3.20117188, + "step": 1961, + "time_per_iteration": 3.0045948028564453 + }, + { + "auxiliary_loss_clip": 0.01558963, + "auxiliary_loss_mlp": 0.01355827, + "balance_loss_clip": 1.17873871, + "balance_loss_mlp": 1.03443837, + "epoch": 0.23591655143389648, + "flos": 22090756570560.0, + "grad_norm": 2.4369391645285052, + "language_loss": 0.73555648, + "learning_rate": 3.571609942827694e-06, + "loss": 0.76470447, + "num_input_tokens_seen": 41834700, + "router_z_loss_clip": 3.8046875, + "router_z_loss_mlp": 3.21289062, + "step": 1962, + "time_per_iteration": 2.998943328857422 + }, + { + "auxiliary_loss_clip": 0.01562057, + "auxiliary_loss_mlp": 0.01356238, + "balance_loss_clip": 1.18113673, + "balance_loss_mlp": 1.03904605, + "epoch": 0.23603679432453556, + "flos": 17019142015680.0, + "grad_norm": 2.0951602610000983, + "language_loss": 0.88886791, + "learning_rate": 3.57112804963472e-06, + "loss": 0.91805089, + "num_input_tokens_seen": 41852915, + "router_z_loss_clip": 3.81054688, + "router_z_loss_mlp": 3.16992188, + "step": 1963, + "time_per_iteration": 3.0018017292022705 + }, + { + "auxiliary_loss_clip": 0.01567505, + "auxiliary_loss_mlp": 0.01345251, + "balance_loss_clip": 1.1862452, + "balance_loss_mlp": 1.0274868, + "epoch": 0.23615703721517464, + "flos": 19173293357760.0, + "grad_norm": 1.9721023195592207, + "language_loss": 0.76571989, + "learning_rate": 3.57064591810287e-06, + "loss": 0.79484749, + "num_input_tokens_seen": 41870415, + "router_z_loss_clip": 3.80859375, + "router_z_loss_mlp": 3.17578125, + "step": 1964, + "time_per_iteration": 2.9040932655334473 + }, + { + "auxiliary_loss_clip": 0.01563644, + "auxiliary_loss_mlp": 0.01362982, + "balance_loss_clip": 1.18298984, + "balance_loss_mlp": 1.04216576, + "epoch": 0.23627728010581375, + "flos": 19100811915360.0, + "grad_norm": 14.537574097521645, + "language_loss": 0.80601728, + "learning_rate": 3.570163548305284e-06, + "loss": 0.83528352, + "num_input_tokens_seen": 41889345, + "router_z_loss_clip": 3.80664062, + "router_z_loss_mlp": 3.20703125, + "step": 1965, + "time_per_iteration": 3.025799512863159 + }, + { + "auxiliary_loss_clip": 0.01568919, + "auxiliary_loss_mlp": 0.01361418, + "balance_loss_clip": 1.18779707, + "balance_loss_mlp": 1.04079247, + "epoch": 0.23639752299645284, + "flos": 14283925433280.0, + "grad_norm": 3.1869054197194364, + "language_loss": 0.70081955, + "learning_rate": 3.569680940315135e-06, + "loss": 0.73012292, + "num_input_tokens_seen": 41905745, + "router_z_loss_clip": 3.80859375, + "router_z_loss_mlp": 3.20507812, + "step": 1966, + "time_per_iteration": 2.9462661743164062 + }, + { + "auxiliary_loss_clip": 0.01564449, + "auxiliary_loss_mlp": 0.01368043, + "balance_loss_clip": 1.18309927, + "balance_loss_mlp": 1.04570127, + "epoch": 0.23651776588709192, + "flos": 22895638068960.0, + "grad_norm": 6.712297485509856, + "language_loss": 0.82107913, + "learning_rate": 3.5691980942056356e-06, + "loss": 0.85040408, + "num_input_tokens_seen": 41925115, + "router_z_loss_clip": 3.8125, + "router_z_loss_mlp": 3.22265625, + "step": 1967, + "time_per_iteration": 2.992021083831787 + }, + { + "auxiliary_loss_clip": 0.01560439, + "auxiliary_loss_mlp": 0.01356396, + "balance_loss_clip": 1.18199432, + "balance_loss_mlp": 1.03233767, + "epoch": 0.23663800877773103, + "flos": 18626515538400.0, + "grad_norm": 2.115259575748825, + "language_loss": 0.79668522, + "learning_rate": 3.5687150100500332e-06, + "loss": 0.82585359, + "num_input_tokens_seen": 41944815, + "router_z_loss_clip": 3.78515625, + "router_z_loss_mlp": 3.24023438, + "step": 1968, + "time_per_iteration": 3.8693673610687256 + }, + { + "auxiliary_loss_clip": 0.01561176, + "auxiliary_loss_mlp": 0.01361704, + "balance_loss_clip": 1.18168235, + "balance_loss_mlp": 1.04279482, + "epoch": 0.2367582516683701, + "flos": 25558145640000.0, + "grad_norm": 1.9397909931144068, + "language_loss": 0.74678457, + "learning_rate": 3.568231687921611e-06, + "loss": 0.77601331, + "num_input_tokens_seen": 41964990, + "router_z_loss_clip": 3.79492188, + "router_z_loss_mlp": 3.1875, + "step": 1969, + "time_per_iteration": 3.9144585132598877 + }, + { + "auxiliary_loss_clip": 0.01560801, + "auxiliary_loss_mlp": 0.01353275, + "balance_loss_clip": 1.18214083, + "balance_loss_mlp": 1.03226781, + "epoch": 0.2368784945590092, + "flos": 23297339219040.0, + "grad_norm": 1.6303171096015683, + "language_loss": 0.80475616, + "learning_rate": 3.5677481278936883e-06, + "loss": 0.83389693, + "num_input_tokens_seen": 41984570, + "router_z_loss_clip": 3.79101562, + "router_z_loss_mlp": 3.20898438, + "step": 1970, + "time_per_iteration": 2.9251365661621094 + }, + { + "auxiliary_loss_clip": 0.01642362, + "auxiliary_loss_mlp": 0.01242523, + "balance_loss_clip": 1.25611019, + "balance_loss_mlp": 0.99075317, + "epoch": 0.23699873744964828, + "flos": 69866681281920.0, + "grad_norm": 0.8444272857940011, + "language_loss": 0.57786214, + "learning_rate": 3.5672643300396214e-06, + "loss": 0.60671097, + "num_input_tokens_seen": 42053715, + "router_z_loss_clip": 3.859375, + "router_z_loss_mlp": 2.515625, + "step": 1971, + "time_per_iteration": 3.495119094848633 + }, + { + "auxiliary_loss_clip": 0.01569681, + "auxiliary_loss_mlp": 0.0135195, + "balance_loss_clip": 1.19057643, + "balance_loss_mlp": 1.03285074, + "epoch": 0.2371189803402874, + "flos": 21837204270720.0, + "grad_norm": 2.592519684840776, + "language_loss": 0.67921877, + "learning_rate": 3.566780294432802e-06, + "loss": 0.70843506, + "num_input_tokens_seen": 42070890, + "router_z_loss_clip": 3.79296875, + "router_z_loss_mlp": 3.18945312, + "step": 1972, + "time_per_iteration": 3.919602155685425 + }, + { + "auxiliary_loss_clip": 0.01570688, + "auxiliary_loss_mlp": 0.01358804, + "balance_loss_clip": 1.18957829, + "balance_loss_mlp": 1.03913212, + "epoch": 0.23723922323092647, + "flos": 21910671845280.0, + "grad_norm": 2.363060333968133, + "language_loss": 0.7487818, + "learning_rate": 3.566296021146657e-06, + "loss": 0.77807671, + "num_input_tokens_seen": 42090270, + "router_z_loss_clip": 3.81445312, + "router_z_loss_mlp": 3.1953125, + "step": 1973, + "time_per_iteration": 2.992128372192383 + }, + { + "auxiliary_loss_clip": 0.01574378, + "auxiliary_loss_mlp": 0.01357969, + "balance_loss_clip": 1.1949265, + "balance_loss_mlp": 1.0344826, + "epoch": 0.23735946612156555, + "flos": 32711619735360.0, + "grad_norm": 1.8687309515105601, + "language_loss": 0.73444468, + "learning_rate": 3.565811510254652e-06, + "loss": 0.76376808, + "num_input_tokens_seen": 42111150, + "router_z_loss_clip": 3.79882812, + "router_z_loss_mlp": 3.234375, + "step": 1974, + "time_per_iteration": 3.144599199295044 + }, + { + "auxiliary_loss_clip": 0.01645203, + "auxiliary_loss_mlp": 0.01264061, + "balance_loss_clip": 1.2602011, + "balance_loss_mlp": 1.01076508, + "epoch": 0.23747970901220466, + "flos": 70554060247680.0, + "grad_norm": 0.8540941584601227, + "language_loss": 0.58203572, + "learning_rate": 3.5653267618302845e-06, + "loss": 0.61112833, + "num_input_tokens_seen": 42178730, + "router_z_loss_clip": 3.84375, + "router_z_loss_mlp": 2.53125, + "step": 1975, + "time_per_iteration": 4.28976583480835 + }, + { + "auxiliary_loss_clip": 0.01565415, + "auxiliary_loss_mlp": 0.01365286, + "balance_loss_clip": 1.18495762, + "balance_loss_mlp": 1.04313445, + "epoch": 0.23759995190284375, + "flos": 20851896693600.0, + "grad_norm": 1.8230056616868289, + "language_loss": 0.85541081, + "learning_rate": 3.564841775947093e-06, + "loss": 0.88471782, + "num_input_tokens_seen": 42199620, + "router_z_loss_clip": 3.8046875, + "router_z_loss_mlp": 3.22070312, + "step": 1976, + "time_per_iteration": 3.021134853363037 + }, + { + "auxiliary_loss_clip": 0.01562934, + "auxiliary_loss_mlp": 0.01364487, + "balance_loss_clip": 1.18439579, + "balance_loss_mlp": 1.04061937, + "epoch": 0.23772019479348283, + "flos": 32924133401760.0, + "grad_norm": 2.4805196156615996, + "language_loss": 0.76059616, + "learning_rate": 3.5643565526786475e-06, + "loss": 0.78987038, + "num_input_tokens_seen": 42219560, + "router_z_loss_clip": 3.78515625, + "router_z_loss_mlp": 3.23828125, + "step": 1977, + "time_per_iteration": 3.0518269538879395 + }, + { + "auxiliary_loss_clip": 0.01559146, + "auxiliary_loss_mlp": 0.01361455, + "balance_loss_clip": 1.18010414, + "balance_loss_mlp": 1.04235554, + "epoch": 0.2378404376841219, + "flos": 32345495563680.0, + "grad_norm": 1.7219800945649515, + "language_loss": 0.77278018, + "learning_rate": 3.5638710920985574e-06, + "loss": 0.80198622, + "num_input_tokens_seen": 42241020, + "router_z_loss_clip": 3.78710938, + "router_z_loss_mlp": 3.18945312, + "step": 1978, + "time_per_iteration": 3.118008852005005 + }, + { + "auxiliary_loss_clip": 0.01557757, + "auxiliary_loss_mlp": 0.01366159, + "balance_loss_clip": 1.17945015, + "balance_loss_mlp": 1.04477084, + "epoch": 0.23796068057476102, + "flos": 22999562320320.0, + "grad_norm": 1.8118630419392607, + "language_loss": 0.82201582, + "learning_rate": 3.5633853942804655e-06, + "loss": 0.851255, + "num_input_tokens_seen": 42259345, + "router_z_loss_clip": 3.78125, + "router_z_loss_mlp": 3.21289062, + "step": 1979, + "time_per_iteration": 3.0307576656341553 + }, + { + "auxiliary_loss_clip": 0.01558136, + "auxiliary_loss_mlp": 0.01366092, + "balance_loss_clip": 1.17909408, + "balance_loss_mlp": 1.04470372, + "epoch": 0.2380809234654001, + "flos": 13482685038240.0, + "grad_norm": 3.297212037838517, + "language_loss": 0.76868594, + "learning_rate": 3.5628994592980527e-06, + "loss": 0.79792821, + "num_input_tokens_seen": 42277250, + "router_z_loss_clip": 3.78710938, + "router_z_loss_mlp": 3.21289062, + "step": 1980, + "time_per_iteration": 3.070441961288452 + }, + { + "auxiliary_loss_clip": 0.01559342, + "auxiliary_loss_mlp": 0.01377426, + "balance_loss_clip": 1.18135631, + "balance_loss_mlp": 1.05508375, + "epoch": 0.2382011663560392, + "flos": 16873610208480.0, + "grad_norm": 2.4983208527190333, + "language_loss": 0.70452189, + "learning_rate": 3.562413287225034e-06, + "loss": 0.73388958, + "num_input_tokens_seen": 42295360, + "router_z_loss_clip": 3.77929688, + "router_z_loss_mlp": 3.22265625, + "step": 1981, + "time_per_iteration": 3.0495169162750244 + }, + { + "auxiliary_loss_clip": 0.01570205, + "auxiliary_loss_mlp": 0.01363359, + "balance_loss_clip": 1.19280076, + "balance_loss_mlp": 1.04158974, + "epoch": 0.2383214092466783, + "flos": 18443055206880.0, + "grad_norm": 2.2421743264517757, + "language_loss": 0.89504141, + "learning_rate": 3.5619268781351623e-06, + "loss": 0.92437708, + "num_input_tokens_seen": 42313430, + "router_z_loss_clip": 3.7734375, + "router_z_loss_mlp": 3.21679688, + "step": 1982, + "time_per_iteration": 2.933671712875366 + }, + { + "auxiliary_loss_clip": 0.01561632, + "auxiliary_loss_mlp": 0.01357542, + "balance_loss_clip": 1.18236804, + "balance_loss_mlp": 1.03748894, + "epoch": 0.23844165213731738, + "flos": 19757772132480.0, + "grad_norm": 2.0929195660361652, + "language_loss": 0.76859617, + "learning_rate": 3.5614402321022256e-06, + "loss": 0.7977879, + "num_input_tokens_seen": 42331260, + "router_z_loss_clip": 3.7890625, + "router_z_loss_mlp": 3.19921875, + "step": 1983, + "time_per_iteration": 2.9817991256713867 + }, + { + "auxiliary_loss_clip": 0.01559934, + "auxiliary_loss_mlp": 0.01355262, + "balance_loss_clip": 1.18127239, + "balance_loss_mlp": 1.0395962, + "epoch": 0.23856189502795647, + "flos": 23369593092480.0, + "grad_norm": 2.5105651448762067, + "language_loss": 0.87407541, + "learning_rate": 3.5609533492000463e-06, + "loss": 0.90322739, + "num_input_tokens_seen": 42350150, + "router_z_loss_clip": 3.78515625, + "router_z_loss_mlp": 3.15429688, + "step": 1984, + "time_per_iteration": 3.0168259143829346 + }, + { + "auxiliary_loss_clip": 0.01565116, + "auxiliary_loss_mlp": 0.01354193, + "balance_loss_clip": 1.18618202, + "balance_loss_mlp": 1.03394926, + "epoch": 0.23868213791859555, + "flos": 23477196375360.0, + "grad_norm": 2.178168734793864, + "language_loss": 0.78526938, + "learning_rate": 3.560466229502485e-06, + "loss": 0.81446254, + "num_input_tokens_seen": 42369495, + "router_z_loss_clip": 3.78515625, + "router_z_loss_mlp": 3.20117188, + "step": 1985, + "time_per_iteration": 3.008774757385254 + }, + { + "auxiliary_loss_clip": 0.01566416, + "auxiliary_loss_mlp": 0.01362385, + "balance_loss_clip": 1.18874013, + "balance_loss_mlp": 1.04519272, + "epoch": 0.23880238080923466, + "flos": 16619185560960.0, + "grad_norm": 2.46054679743156, + "language_loss": 0.89982557, + "learning_rate": 3.5599788730834384e-06, + "loss": 0.92911363, + "num_input_tokens_seen": 42387455, + "router_z_loss_clip": 3.7734375, + "router_z_loss_mlp": 3.16992188, + "step": 1986, + "time_per_iteration": 2.9913876056671143 + }, + { + "auxiliary_loss_clip": 0.01568314, + "auxiliary_loss_mlp": 0.01353993, + "balance_loss_clip": 1.19127572, + "balance_loss_mlp": 1.03451192, + "epoch": 0.23892262369987374, + "flos": 17350371915840.0, + "grad_norm": 2.844269175274086, + "language_loss": 0.78693956, + "learning_rate": 3.559491280016836e-06, + "loss": 0.81616265, + "num_input_tokens_seen": 42405400, + "router_z_loss_clip": 3.76757812, + "router_z_loss_mlp": 3.19335938, + "step": 1987, + "time_per_iteration": 2.989781618118286 + }, + { + "auxiliary_loss_clip": 0.01564445, + "auxiliary_loss_mlp": 0.01354354, + "balance_loss_clip": 1.1873467, + "balance_loss_mlp": 1.03620839, + "epoch": 0.23904286659051283, + "flos": 22312107498240.0, + "grad_norm": 2.6012512386788185, + "language_loss": 0.70828688, + "learning_rate": 3.5590034503766465e-06, + "loss": 0.73747492, + "num_input_tokens_seen": 42425065, + "router_z_loss_clip": 3.76757812, + "router_z_loss_mlp": 3.1796875, + "step": 1988, + "time_per_iteration": 3.0419704914093018 + }, + { + "auxiliary_loss_clip": 0.01570393, + "auxiliary_loss_mlp": 0.01341987, + "balance_loss_clip": 1.19406581, + "balance_loss_mlp": 1.02517629, + "epoch": 0.23916310948115194, + "flos": 21180357838080.0, + "grad_norm": 2.139114324157419, + "language_loss": 0.80999076, + "learning_rate": 3.558515384236874e-06, + "loss": 0.83911461, + "num_input_tokens_seen": 42442495, + "router_z_loss_clip": 3.76171875, + "router_z_loss_mlp": 3.16601562, + "step": 1989, + "time_per_iteration": 2.991551399230957 + }, + { + "auxiliary_loss_clip": 0.01571199, + "auxiliary_loss_mlp": 0.01357765, + "balance_loss_clip": 1.19562221, + "balance_loss_mlp": 1.04000056, + "epoch": 0.23928335237179102, + "flos": 14139114261120.0, + "grad_norm": 2.400770237999985, + "language_loss": 0.84153873, + "learning_rate": 3.558027081671556e-06, + "loss": 0.87082839, + "num_input_tokens_seen": 42459480, + "router_z_loss_clip": 3.75390625, + "router_z_loss_mlp": 3.17578125, + "step": 1990, + "time_per_iteration": 3.061889886856079 + }, + { + "auxiliary_loss_clip": 0.01569288, + "auxiliary_loss_mlp": 0.01370909, + "balance_loss_clip": 1.19256198, + "balance_loss_mlp": 1.05047417, + "epoch": 0.2394035952624301, + "flos": 23771901093120.0, + "grad_norm": 1.8842242223293701, + "language_loss": 0.68870836, + "learning_rate": 3.557538542754769e-06, + "loss": 0.71811032, + "num_input_tokens_seen": 42479175, + "router_z_loss_clip": 3.76757812, + "router_z_loss_mlp": 3.203125, + "step": 1991, + "time_per_iteration": 3.0521912574768066 + }, + { + "auxiliary_loss_clip": 0.01571919, + "auxiliary_loss_mlp": 0.01356473, + "balance_loss_clip": 1.19378006, + "balance_loss_mlp": 1.03661048, + "epoch": 0.2395238381530692, + "flos": 24208458586560.0, + "grad_norm": 1.9337068029742466, + "language_loss": 0.66946745, + "learning_rate": 3.557049767560623e-06, + "loss": 0.69875139, + "num_input_tokens_seen": 42498090, + "router_z_loss_clip": 3.78320312, + "router_z_loss_mlp": 3.19726562, + "step": 1992, + "time_per_iteration": 3.0382509231567383 + }, + { + "auxiliary_loss_clip": 0.01581084, + "auxiliary_loss_mlp": 0.01358698, + "balance_loss_clip": 1.20378804, + "balance_loss_mlp": 1.04169703, + "epoch": 0.2396440810437083, + "flos": 25297804199520.0, + "grad_norm": 2.271483252752329, + "language_loss": 0.85920852, + "learning_rate": 3.5565607561632655e-06, + "loss": 0.88860631, + "num_input_tokens_seen": 42516930, + "router_z_loss_clip": 3.7734375, + "router_z_loss_mlp": 3.16796875, + "step": 1993, + "time_per_iteration": 3.111929178237915 + }, + { + "auxiliary_loss_clip": 0.01569355, + "auxiliary_loss_mlp": 0.01357746, + "balance_loss_clip": 1.19072473, + "balance_loss_mlp": 1.03883743, + "epoch": 0.23976432393434738, + "flos": 28545321539520.0, + "grad_norm": 2.4919586440917665, + "language_loss": 0.79783213, + "learning_rate": 3.5560715086368787e-06, + "loss": 0.82710314, + "num_input_tokens_seen": 42534800, + "router_z_loss_clip": 3.78515625, + "router_z_loss_mlp": 3.1875, + "step": 1994, + "time_per_iteration": 3.387178659439087 + }, + { + "auxiliary_loss_clip": 0.01571503, + "auxiliary_loss_mlp": 0.01353702, + "balance_loss_clip": 1.19522583, + "balance_loss_mlp": 1.03670096, + "epoch": 0.23988456682498646, + "flos": 19496065278240.0, + "grad_norm": 2.302080603341228, + "language_loss": 0.82176572, + "learning_rate": 3.5555820250556816e-06, + "loss": 0.85101783, + "num_input_tokens_seen": 42552000, + "router_z_loss_clip": 3.7578125, + "router_z_loss_mlp": 3.16796875, + "step": 1995, + "time_per_iteration": 3.7704508304595947 + }, + { + "auxiliary_loss_clip": 0.01581506, + "auxiliary_loss_mlp": 0.01356966, + "balance_loss_clip": 1.20516491, + "balance_loss_mlp": 1.04110909, + "epoch": 0.24000480971562557, + "flos": 20268404051040.0, + "grad_norm": 2.3469346663834028, + "language_loss": 0.69419175, + "learning_rate": 3.5550923054939278e-06, + "loss": 0.72357643, + "num_input_tokens_seen": 42571455, + "router_z_loss_clip": 3.76171875, + "router_z_loss_mlp": 3.15625, + "step": 1996, + "time_per_iteration": 4.058465957641602 + }, + { + "auxiliary_loss_clip": 0.01577141, + "auxiliary_loss_mlp": 0.01350953, + "balance_loss_clip": 1.20015383, + "balance_loss_mlp": 1.03070915, + "epoch": 0.24012505260626466, + "flos": 25445308271040.0, + "grad_norm": 2.380865997261595, + "language_loss": 0.74744463, + "learning_rate": 3.5546023500259083e-06, + "loss": 0.77672559, + "num_input_tokens_seen": 42592550, + "router_z_loss_clip": 3.77148438, + "router_z_loss_mlp": 3.20117188, + "step": 1997, + "time_per_iteration": 3.0356314182281494 + }, + { + "auxiliary_loss_clip": 0.0157735, + "auxiliary_loss_mlp": 0.01356046, + "balance_loss_clip": 1.20099556, + "balance_loss_mlp": 1.03637433, + "epoch": 0.24024529549690374, + "flos": 15554797041600.0, + "grad_norm": 3.274052336465954, + "language_loss": 0.80375433, + "learning_rate": 3.5541121587259477e-06, + "loss": 0.83308834, + "num_input_tokens_seen": 42610385, + "router_z_loss_clip": 3.76367188, + "router_z_loss_mlp": 3.1953125, + "step": 1998, + "time_per_iteration": 2.968715190887451 + }, + { + "auxiliary_loss_clip": 0.01635334, + "auxiliary_loss_mlp": 0.01345108, + "balance_loss_clip": 1.25188565, + "balance_loss_mlp": 1.08647156, + "epoch": 0.24036553838754285, + "flos": 57128836736160.0, + "grad_norm": 0.9178455143037578, + "language_loss": 0.57806218, + "learning_rate": 3.553621731668408e-06, + "loss": 0.60786653, + "num_input_tokens_seen": 42673595, + "router_z_loss_clip": 3.8359375, + "router_z_loss_mlp": 2.5859375, + "step": 1999, + "time_per_iteration": 3.4327399730682373 + }, + { + "auxiliary_loss_clip": 0.01568216, + "auxiliary_loss_mlp": 0.01350813, + "balance_loss_clip": 1.19130826, + "balance_loss_mlp": 1.03152311, + "epoch": 0.24048578127818193, + "flos": 24972111810720.0, + "grad_norm": 3.0528976137917314, + "language_loss": 0.83404958, + "learning_rate": 3.553131068927688e-06, + "loss": 0.86323988, + "num_input_tokens_seen": 42692000, + "router_z_loss_clip": 3.76757812, + "router_z_loss_mlp": 3.19140625, + "step": 2000, + "time_per_iteration": 3.912391424179077 + }, + { + "auxiliary_loss_clip": 0.01577821, + "auxiliary_loss_mlp": 0.01363939, + "balance_loss_clip": 1.20104933, + "balance_loss_mlp": 1.04636538, + "epoch": 0.24060602416882101, + "flos": 23333181694560.0, + "grad_norm": 2.5565332257852984, + "language_loss": 0.80120587, + "learning_rate": 3.552640170578219e-06, + "loss": 0.83062339, + "num_input_tokens_seen": 42712250, + "router_z_loss_clip": 3.76757812, + "router_z_loss_mlp": 3.17382812, + "step": 2001, + "time_per_iteration": 2.978944778442383 + }, + { + "auxiliary_loss_clip": 0.01572604, + "auxiliary_loss_mlp": 0.01363022, + "balance_loss_clip": 1.19549358, + "balance_loss_mlp": 1.04735601, + "epoch": 0.2407262670594601, + "flos": 14174994664800.0, + "grad_norm": 3.0696565209064737, + "language_loss": 0.78161883, + "learning_rate": 3.5521490366944703e-06, + "loss": 0.81097513, + "num_input_tokens_seen": 42729900, + "router_z_loss_clip": 3.76953125, + "router_z_loss_mlp": 3.15429688, + "step": 2002, + "time_per_iteration": 3.014816999435425 + }, + { + "auxiliary_loss_clip": 0.01580772, + "auxiliary_loss_mlp": 0.01350782, + "balance_loss_clip": 1.20409894, + "balance_loss_mlp": 1.0351162, + "epoch": 0.2408465099500992, + "flos": 13664855812320.0, + "grad_norm": 3.2229752179221443, + "language_loss": 0.80421835, + "learning_rate": 3.5516576673509474e-06, + "loss": 0.83353388, + "num_input_tokens_seen": 42747900, + "router_z_loss_clip": 3.765625, + "router_z_loss_mlp": 3.15429688, + "step": 2003, + "time_per_iteration": 3.78861403465271 + }, + { + "auxiliary_loss_clip": 0.0157817, + "auxiliary_loss_mlp": 0.01370888, + "balance_loss_clip": 1.19988966, + "balance_loss_mlp": 1.05636668, + "epoch": 0.2409667528407383, + "flos": 31250688295680.0, + "grad_norm": 1.7486454457546137, + "language_loss": 0.86501688, + "learning_rate": 3.5511660626221896e-06, + "loss": 0.89450753, + "num_input_tokens_seen": 42768540, + "router_z_loss_clip": 3.78515625, + "router_z_loss_mlp": 3.14257812, + "step": 2004, + "time_per_iteration": 2.9840621948242188 + }, + { + "auxiliary_loss_clip": 0.01580622, + "auxiliary_loss_mlp": 0.01367623, + "balance_loss_clip": 1.20293021, + "balance_loss_mlp": 1.0500493, + "epoch": 0.24108699573137737, + "flos": 22202114741280.0, + "grad_norm": 2.3910884383700943, + "language_loss": 0.89074993, + "learning_rate": 3.5506742225827744e-06, + "loss": 0.92023242, + "num_input_tokens_seen": 42785395, + "router_z_loss_clip": 3.78125, + "router_z_loss_mlp": 3.17382812, + "step": 2005, + "time_per_iteration": 3.006788492202759 + }, + { + "auxiliary_loss_clip": 0.01578694, + "auxiliary_loss_mlp": 0.01360136, + "balance_loss_clip": 1.20102811, + "balance_loss_mlp": 1.04466057, + "epoch": 0.24120723862201648, + "flos": 26105378597280.0, + "grad_norm": 2.5194474493883314, + "language_loss": 0.90410244, + "learning_rate": 3.5501821473073116e-06, + "loss": 0.93349075, + "num_input_tokens_seen": 42801980, + "router_z_loss_clip": 3.77734375, + "router_z_loss_mlp": 3.15234375, + "step": 2006, + "time_per_iteration": 3.0267696380615234 + }, + { + "auxiliary_loss_clip": 0.01575113, + "auxiliary_loss_mlp": 0.01357933, + "balance_loss_clip": 1.19851422, + "balance_loss_mlp": 1.04112244, + "epoch": 0.24132748151265557, + "flos": 18626932748160.0, + "grad_norm": 2.0294948506719024, + "language_loss": 0.86465603, + "learning_rate": 3.54968983687045e-06, + "loss": 0.89398652, + "num_input_tokens_seen": 42818850, + "router_z_loss_clip": 3.765625, + "router_z_loss_mlp": 3.16601562, + "step": 2007, + "time_per_iteration": 2.954585552215576 + }, + { + "auxiliary_loss_clip": 0.01571847, + "auxiliary_loss_mlp": 0.01355534, + "balance_loss_clip": 1.19365382, + "balance_loss_mlp": 1.03929555, + "epoch": 0.24144772440329465, + "flos": 15269460579360.0, + "grad_norm": 2.60862921495219, + "language_loss": 0.89404106, + "learning_rate": 3.549197291346872e-06, + "loss": 0.92331493, + "num_input_tokens_seen": 42835375, + "router_z_loss_clip": 3.78320312, + "router_z_loss_mlp": 3.16015625, + "step": 2008, + "time_per_iteration": 2.9559710025787354 + }, + { + "auxiliary_loss_clip": 0.01583138, + "auxiliary_loss_mlp": 0.01360126, + "balance_loss_clip": 1.20494795, + "balance_loss_mlp": 1.04159856, + "epoch": 0.24156796729393373, + "flos": 24026553309600.0, + "grad_norm": 2.1004139195627567, + "language_loss": 0.79175097, + "learning_rate": 3.548704510811297e-06, + "loss": 0.82118356, + "num_input_tokens_seen": 42854570, + "router_z_loss_clip": 3.78320312, + "router_z_loss_mlp": 3.18359375, + "step": 2009, + "time_per_iteration": 2.9251174926757812 + }, + { + "auxiliary_loss_clip": 0.0157194, + "auxiliary_loss_mlp": 0.013511, + "balance_loss_clip": 1.19339442, + "balance_loss_mlp": 1.03581548, + "epoch": 0.24168821018457284, + "flos": 26289218210400.0, + "grad_norm": 2.420408047470232, + "language_loss": 0.74952579, + "learning_rate": 3.5482114953384787e-06, + "loss": 0.7787562, + "num_input_tokens_seen": 42873800, + "router_z_loss_clip": 3.7890625, + "router_z_loss_mlp": 3.15039062, + "step": 2010, + "time_per_iteration": 2.9969680309295654 + }, + { + "auxiliary_loss_clip": 0.0157789, + "auxiliary_loss_mlp": 0.0134477, + "balance_loss_clip": 1.19851303, + "balance_loss_mlp": 1.027578, + "epoch": 0.24180845307521193, + "flos": 18225155741760.0, + "grad_norm": 2.16811297909125, + "language_loss": 0.84305263, + "learning_rate": 3.5477182450032077e-06, + "loss": 0.87227923, + "num_input_tokens_seen": 42892400, + "router_z_loss_clip": 3.79492188, + "router_z_loss_mlp": 3.16992188, + "step": 2011, + "time_per_iteration": 2.9633290767669678 + }, + { + "auxiliary_loss_clip": 0.01574102, + "auxiliary_loss_mlp": 0.01348015, + "balance_loss_clip": 1.19644678, + "balance_loss_mlp": 1.03158605, + "epoch": 0.241928695965851, + "flos": 20451371316480.0, + "grad_norm": 2.9726582407783146, + "language_loss": 0.83534789, + "learning_rate": 3.5472247598803097e-06, + "loss": 0.86456907, + "num_input_tokens_seen": 42911745, + "router_z_loss_clip": 3.77539062, + "router_z_loss_mlp": 3.16210938, + "step": 2012, + "time_per_iteration": 3.0585057735443115 + }, + { + "auxiliary_loss_clip": 0.01574948, + "auxiliary_loss_mlp": 0.01348003, + "balance_loss_clip": 1.19649839, + "balance_loss_mlp": 1.03081131, + "epoch": 0.24204893885649012, + "flos": 25558828346880.0, + "grad_norm": 3.9552154138491336, + "language_loss": 0.85710984, + "learning_rate": 3.546731040044645e-06, + "loss": 0.88633937, + "num_input_tokens_seen": 42926915, + "router_z_loss_clip": 3.7890625, + "router_z_loss_mlp": 3.16992188, + "step": 2013, + "time_per_iteration": 3.2175793647766113 + }, + { + "auxiliary_loss_clip": 0.0157823, + "auxiliary_loss_mlp": 0.01352877, + "balance_loss_clip": 1.19889522, + "balance_loss_mlp": 1.03663826, + "epoch": 0.2421691817471292, + "flos": 30662606345760.0, + "grad_norm": 1.9333376300320528, + "language_loss": 0.75413901, + "learning_rate": 3.546237085571112e-06, + "loss": 0.78345007, + "num_input_tokens_seen": 42945350, + "router_z_loss_clip": 3.79296875, + "router_z_loss_mlp": 3.16015625, + "step": 2014, + "time_per_iteration": 3.048351287841797 + }, + { + "auxiliary_loss_clip": 0.01578224, + "auxiliary_loss_mlp": 0.01348425, + "balance_loss_clip": 1.20013809, + "balance_loss_mlp": 1.03180552, + "epoch": 0.24228942463776829, + "flos": 21947197027680.0, + "grad_norm": 2.6337039017917823, + "language_loss": 0.72749937, + "learning_rate": 3.5457428965346425e-06, + "loss": 0.75676584, + "num_input_tokens_seen": 42964290, + "router_z_loss_clip": 3.78125, + "router_z_loss_mlp": 3.1640625, + "step": 2015, + "time_per_iteration": 3.007011651992798 + }, + { + "auxiliary_loss_clip": 0.01582626, + "auxiliary_loss_mlp": 0.01344255, + "balance_loss_clip": 1.20268178, + "balance_loss_mlp": 1.03011513, + "epoch": 0.2424096675284074, + "flos": 33987308220000.0, + "grad_norm": 1.9221509560174097, + "language_loss": 0.74600172, + "learning_rate": 3.545248473010205e-06, + "loss": 0.77527058, + "num_input_tokens_seen": 42987095, + "router_z_loss_clip": 3.796875, + "router_z_loss_mlp": 3.13867188, + "step": 2016, + "time_per_iteration": 3.0540518760681152 + }, + { + "auxiliary_loss_clip": 0.01575753, + "auxiliary_loss_mlp": 0.01350045, + "balance_loss_clip": 1.19839251, + "balance_loss_mlp": 1.02903795, + "epoch": 0.24252991041904648, + "flos": 21655602419040.0, + "grad_norm": 1.6979813270838562, + "language_loss": 0.87744892, + "learning_rate": 3.544753815072802e-06, + "loss": 0.90670687, + "num_input_tokens_seen": 43005750, + "router_z_loss_clip": 3.77148438, + "router_z_loss_mlp": 3.20898438, + "step": 2017, + "time_per_iteration": 2.960052728652954 + }, + { + "auxiliary_loss_clip": 0.01574535, + "auxiliary_loss_mlp": 0.01354834, + "balance_loss_clip": 1.19653177, + "balance_loss_mlp": 1.03802335, + "epoch": 0.24265015330968556, + "flos": 21872136470400.0, + "grad_norm": 1.9387314002295668, + "language_loss": 0.88513786, + "learning_rate": 3.544258922797474e-06, + "loss": 0.91443157, + "num_input_tokens_seen": 43023870, + "router_z_loss_clip": 3.77929688, + "router_z_loss_mlp": 3.16601562, + "step": 2018, + "time_per_iteration": 3.0093319416046143 + }, + { + "auxiliary_loss_clip": 0.01576142, + "auxiliary_loss_mlp": 0.0136552, + "balance_loss_clip": 1.19899023, + "balance_loss_mlp": 1.04985392, + "epoch": 0.24277039620032465, + "flos": 25630551226080.0, + "grad_norm": 1.8354143025811418, + "language_loss": 0.78478944, + "learning_rate": 3.543763796259295e-06, + "loss": 0.814206, + "num_input_tokens_seen": 43043825, + "router_z_loss_clip": 3.76953125, + "router_z_loss_mlp": 3.15429688, + "step": 2019, + "time_per_iteration": 2.9737401008605957 + }, + { + "auxiliary_loss_clip": 0.01571559, + "auxiliary_loss_mlp": 0.01368734, + "balance_loss_clip": 1.1925391, + "balance_loss_mlp": 1.05173302, + "epoch": 0.24289063909096376, + "flos": 26288990641440.0, + "grad_norm": 2.0457093713822148, + "language_loss": 0.91346121, + "learning_rate": 3.5432684355333754e-06, + "loss": 0.94286406, + "num_input_tokens_seen": 43062480, + "router_z_loss_clip": 3.7890625, + "router_z_loss_mlp": 3.16796875, + "step": 2020, + "time_per_iteration": 3.0124778747558594 + }, + { + "auxiliary_loss_clip": 0.01572361, + "auxiliary_loss_mlp": 0.01351259, + "balance_loss_clip": 1.19527793, + "balance_loss_mlp": 1.03635526, + "epoch": 0.24301088198160284, + "flos": 25076984266080.0, + "grad_norm": 7.777503813899524, + "language_loss": 0.77141327, + "learning_rate": 3.5427728406948613e-06, + "loss": 0.80064946, + "num_input_tokens_seen": 43081595, + "router_z_loss_clip": 3.77148438, + "router_z_loss_mlp": 3.14648438, + "step": 2021, + "time_per_iteration": 3.142493963241577 + }, + { + "auxiliary_loss_clip": 0.01641284, + "auxiliary_loss_mlp": 0.0135907, + "balance_loss_clip": 1.25976443, + "balance_loss_mlp": 1.11035156, + "epoch": 0.24313112487224192, + "flos": 69908630191200.0, + "grad_norm": 0.8615747798465039, + "language_loss": 0.57852948, + "learning_rate": 3.542277011818934e-06, + "loss": 0.60853302, + "num_input_tokens_seen": 43145430, + "router_z_loss_clip": 3.8125, + "router_z_loss_mlp": 2.484375, + "step": 2022, + "time_per_iteration": 3.5609941482543945 + }, + { + "auxiliary_loss_clip": 0.01582038, + "auxiliary_loss_mlp": 0.01346948, + "balance_loss_clip": 1.20513725, + "balance_loss_mlp": 1.03204513, + "epoch": 0.24325136776288103, + "flos": 40665575662560.0, + "grad_norm": 2.1757993141924414, + "language_loss": 0.74113178, + "learning_rate": 3.5417809489808104e-06, + "loss": 0.77042162, + "num_input_tokens_seen": 43167040, + "router_z_loss_clip": 3.765625, + "router_z_loss_mlp": 3.14648438, + "step": 2023, + "time_per_iteration": 4.878771543502808 + }, + { + "auxiliary_loss_clip": 0.01573432, + "auxiliary_loss_mlp": 0.01357891, + "balance_loss_clip": 1.19601047, + "balance_loss_mlp": 1.0399363, + "epoch": 0.24337161065352012, + "flos": 25048765350720.0, + "grad_norm": 2.200655876194423, + "language_loss": 0.7279914, + "learning_rate": 3.5412846522557422e-06, + "loss": 0.75730461, + "num_input_tokens_seen": 43187930, + "router_z_loss_clip": 3.77148438, + "router_z_loss_mlp": 3.17773438, + "step": 2024, + "time_per_iteration": 3.0406622886657715 + }, + { + "auxiliary_loss_clip": 0.01583309, + "auxiliary_loss_mlp": 0.01365649, + "balance_loss_clip": 1.20546341, + "balance_loss_mlp": 1.04597783, + "epoch": 0.2434918535441592, + "flos": 18663154505280.0, + "grad_norm": 3.105204344312934, + "language_loss": 0.74492705, + "learning_rate": 3.540788121719018e-06, + "loss": 0.77441669, + "num_input_tokens_seen": 43206350, + "router_z_loss_clip": 3.77734375, + "router_z_loss_mlp": 3.1953125, + "step": 2025, + "time_per_iteration": 2.9659152030944824 + }, + { + "auxiliary_loss_clip": 0.01574742, + "auxiliary_loss_mlp": 0.01389685, + "balance_loss_clip": 1.19859362, + "balance_loss_mlp": 1.06658006, + "epoch": 0.24361209643479828, + "flos": 23917432900320.0, + "grad_norm": 2.6461403946999504, + "language_loss": 0.82421935, + "learning_rate": 3.5402913574459604e-06, + "loss": 0.85386354, + "num_input_tokens_seen": 43226255, + "router_z_loss_clip": 3.76367188, + "router_z_loss_mlp": 3.23046875, + "step": 2026, + "time_per_iteration": 2.9921185970306396 + }, + { + "auxiliary_loss_clip": 0.01578385, + "auxiliary_loss_mlp": 0.01371738, + "balance_loss_clip": 1.20125055, + "balance_loss_mlp": 1.05130386, + "epoch": 0.2437323393254374, + "flos": 28660017388320.0, + "grad_norm": 2.2365886116985356, + "language_loss": 0.86557639, + "learning_rate": 3.5397943595119297e-06, + "loss": 0.89507759, + "num_input_tokens_seen": 43247675, + "router_z_loss_clip": 3.7734375, + "router_z_loss_mlp": 3.203125, + "step": 2027, + "time_per_iteration": 3.041463851928711 + }, + { + "auxiliary_loss_clip": 0.01573844, + "auxiliary_loss_mlp": 0.0138213, + "balance_loss_clip": 1.19772148, + "balance_loss_mlp": 1.0607419, + "epoch": 0.24385258221607647, + "flos": 23552598286080.0, + "grad_norm": 2.6127133475039503, + "language_loss": 0.77509177, + "learning_rate": 3.5392971279923177e-06, + "loss": 0.80465156, + "num_input_tokens_seen": 43265895, + "router_z_loss_clip": 3.76171875, + "router_z_loss_mlp": 3.21289062, + "step": 2028, + "time_per_iteration": 3.8314850330352783 + }, + { + "auxiliary_loss_clip": 0.01576713, + "auxiliary_loss_mlp": 0.01379187, + "balance_loss_clip": 1.20204437, + "balance_loss_mlp": 1.0562731, + "epoch": 0.24397282510671556, + "flos": 25338349766880.0, + "grad_norm": 2.386302413752652, + "language_loss": 0.82970345, + "learning_rate": 3.5387996629625557e-06, + "loss": 0.85926247, + "num_input_tokens_seen": 43283485, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 3.22851562, + "step": 2029, + "time_per_iteration": 2.9924237728118896 + }, + { + "auxiliary_loss_clip": 0.01619603, + "auxiliary_loss_mlp": 0.01383659, + "balance_loss_clip": 1.23947573, + "balance_loss_mlp": 1.11434174, + "epoch": 0.24409306799735467, + "flos": 65194454259360.0, + "grad_norm": 0.8502337275458054, + "language_loss": 0.54888183, + "learning_rate": 3.5383019644981083e-06, + "loss": 0.5789144, + "num_input_tokens_seen": 43347180, + "router_z_loss_clip": 3.8046875, + "router_z_loss_mlp": 2.6953125, + "step": 2030, + "time_per_iteration": 4.229549407958984 + }, + { + "auxiliary_loss_clip": 0.01573821, + "auxiliary_loss_mlp": 0.01382487, + "balance_loss_clip": 1.19965744, + "balance_loss_mlp": 1.0578562, + "epoch": 0.24421331088799375, + "flos": 19539152032320.0, + "grad_norm": 2.2516192314178354, + "language_loss": 0.73185408, + "learning_rate": 3.5378040326744763e-06, + "loss": 0.76141715, + "num_input_tokens_seen": 43366665, + "router_z_loss_clip": 3.74414062, + "router_z_loss_mlp": 3.24609375, + "step": 2031, + "time_per_iteration": 2.9778223037719727 + }, + { + "auxiliary_loss_clip": 0.0156671, + "auxiliary_loss_mlp": 0.013682, + "balance_loss_clip": 1.18896699, + "balance_loss_mlp": 1.04871941, + "epoch": 0.24433355377863283, + "flos": 21070668506400.0, + "grad_norm": 2.431423282363003, + "language_loss": 0.85817438, + "learning_rate": 3.5373058675671946e-06, + "loss": 0.88752353, + "num_input_tokens_seen": 43384670, + "router_z_loss_clip": 3.77734375, + "router_z_loss_mlp": 3.19335938, + "step": 2032, + "time_per_iteration": 3.0674235820770264 + }, + { + "auxiliary_loss_clip": 0.01565083, + "auxiliary_loss_mlp": 0.01359573, + "balance_loss_clip": 1.19033885, + "balance_loss_mlp": 1.04047358, + "epoch": 0.24445379666927192, + "flos": 22639354941600.0, + "grad_norm": 2.005364396928138, + "language_loss": 0.72854787, + "learning_rate": 3.536807469251836e-06, + "loss": 0.75779444, + "num_input_tokens_seen": 43403825, + "router_z_loss_clip": 3.74414062, + "router_z_loss_mlp": 3.18945312, + "step": 2033, + "time_per_iteration": 3.055988073348999 + }, + { + "auxiliary_loss_clip": 0.01559661, + "auxiliary_loss_mlp": 0.01351087, + "balance_loss_clip": 1.18353689, + "balance_loss_mlp": 1.03332257, + "epoch": 0.24457403955991103, + "flos": 21253749556320.0, + "grad_norm": 2.109737002730714, + "language_loss": 0.82699561, + "learning_rate": 3.5363088378040055e-06, + "loss": 0.85610312, + "num_input_tokens_seen": 43422715, + "router_z_loss_clip": 3.76367188, + "router_z_loss_mlp": 3.17578125, + "step": 2034, + "time_per_iteration": 3.084439992904663 + }, + { + "auxiliary_loss_clip": 0.01599415, + "auxiliary_loss_mlp": 0.01268097, + "balance_loss_clip": 1.21985316, + "balance_loss_mlp": 1.00946045, + "epoch": 0.2446942824505501, + "flos": 67004176701600.0, + "grad_norm": 1.307550557177725, + "language_loss": 0.64345121, + "learning_rate": 3.5358099732993463e-06, + "loss": 0.67212641, + "num_input_tokens_seen": 43481825, + "router_z_loss_clip": 3.7890625, + "router_z_loss_mlp": 2.5859375, + "step": 2035, + "time_per_iteration": 3.4709177017211914 + }, + { + "auxiliary_loss_clip": 0.01565698, + "auxiliary_loss_mlp": 0.01346703, + "balance_loss_clip": 1.19122183, + "balance_loss_mlp": 1.02836657, + "epoch": 0.2448145253411892, + "flos": 20413139366880.0, + "grad_norm": 2.3209065874039556, + "language_loss": 0.89351392, + "learning_rate": 3.535310875813535e-06, + "loss": 0.92263794, + "num_input_tokens_seen": 43500220, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 3.18164062, + "step": 2036, + "time_per_iteration": 3.012537717819214 + }, + { + "auxiliary_loss_clip": 0.01562925, + "auxiliary_loss_mlp": 0.01348579, + "balance_loss_clip": 1.18670988, + "balance_loss_mlp": 1.03424799, + "epoch": 0.2449347682318283, + "flos": 28807331819040.0, + "grad_norm": 2.1748912414965402, + "language_loss": 0.81758708, + "learning_rate": 3.5348115454222843e-06, + "loss": 0.8467021, + "num_input_tokens_seen": 43522805, + "router_z_loss_clip": 3.76367188, + "router_z_loss_mlp": 3.140625, + "step": 2037, + "time_per_iteration": 3.043870449066162 + }, + { + "auxiliary_loss_clip": 0.01562503, + "auxiliary_loss_mlp": 0.01342111, + "balance_loss_clip": 1.18485677, + "balance_loss_mlp": 1.02949643, + "epoch": 0.2450550111224674, + "flos": 22531220664480.0, + "grad_norm": 2.1970420808583295, + "language_loss": 0.86328125, + "learning_rate": 3.5343119822013425e-06, + "loss": 0.89232737, + "num_input_tokens_seen": 43541915, + "router_z_loss_clip": 3.77539062, + "router_z_loss_mlp": 3.12304688, + "step": 2038, + "time_per_iteration": 3.025242805480957 + }, + { + "auxiliary_loss_clip": 0.01570034, + "auxiliary_loss_mlp": 0.01362918, + "balance_loss_clip": 1.19408309, + "balance_loss_mlp": 1.04801488, + "epoch": 0.24517525401310647, + "flos": 21761119653120.0, + "grad_norm": 2.5927424969998403, + "language_loss": 0.77488339, + "learning_rate": 3.533812186226493e-06, + "loss": 0.80421293, + "num_input_tokens_seen": 43562625, + "router_z_loss_clip": 3.75585938, + "router_z_loss_mlp": 3.14648438, + "step": 2039, + "time_per_iteration": 2.9698400497436523 + }, + { + "auxiliary_loss_clip": 0.01558868, + "auxiliary_loss_mlp": 0.01351137, + "balance_loss_clip": 1.18386018, + "balance_loss_mlp": 1.03928566, + "epoch": 0.24529549690374555, + "flos": 25045503528960.0, + "grad_norm": 2.191974192712098, + "language_loss": 0.75853431, + "learning_rate": 3.5333121575735545e-06, + "loss": 0.78763437, + "num_input_tokens_seen": 43582265, + "router_z_loss_clip": 3.75, + "router_z_loss_mlp": 3.11523438, + "step": 2040, + "time_per_iteration": 2.995392322540283 + }, + { + "auxiliary_loss_clip": 0.01562885, + "auxiliary_loss_mlp": 0.01364022, + "balance_loss_clip": 1.1871016, + "balance_loss_mlp": 1.04854655, + "epoch": 0.24541573979438466, + "flos": 32126344469280.0, + "grad_norm": 4.200639689800712, + "language_loss": 0.76089823, + "learning_rate": 3.532811896318381e-06, + "loss": 0.79016733, + "num_input_tokens_seen": 43604335, + "router_z_loss_clip": 3.75976562, + "router_z_loss_mlp": 3.15234375, + "step": 2041, + "time_per_iteration": 3.058692693710327 + }, + { + "auxiliary_loss_clip": 0.01565079, + "auxiliary_loss_mlp": 0.01361491, + "balance_loss_clip": 1.19062269, + "balance_loss_mlp": 1.04658747, + "epoch": 0.24553598268502375, + "flos": 31360225914720.0, + "grad_norm": 2.3523838195826703, + "language_loss": 0.81742513, + "learning_rate": 3.5323114025368615e-06, + "loss": 0.84669083, + "num_input_tokens_seen": 43619400, + "router_z_loss_clip": 3.7421875, + "router_z_loss_mlp": 3.14648438, + "step": 2042, + "time_per_iteration": 3.0626680850982666 + }, + { + "auxiliary_loss_clip": 0.01557024, + "auxiliary_loss_mlp": 0.01360024, + "balance_loss_clip": 1.18295407, + "balance_loss_mlp": 1.04321337, + "epoch": 0.24565622557566283, + "flos": 14029197360480.0, + "grad_norm": 2.4873724328044395, + "language_loss": 0.81957889, + "learning_rate": 3.53181067630492e-06, + "loss": 0.8487494, + "num_input_tokens_seen": 43636870, + "router_z_loss_clip": 3.73828125, + "router_z_loss_mlp": 3.16601562, + "step": 2043, + "time_per_iteration": 3.011246681213379 + }, + { + "auxiliary_loss_clip": 0.0155851, + "auxiliary_loss_mlp": 0.01349036, + "balance_loss_clip": 1.18402338, + "balance_loss_mlp": 1.03470504, + "epoch": 0.24577646846630194, + "flos": 16583343085440.0, + "grad_norm": 2.32525248630051, + "language_loss": 0.76430339, + "learning_rate": 3.5313097176985175e-06, + "loss": 0.79337883, + "num_input_tokens_seen": 43655180, + "router_z_loss_clip": 3.7421875, + "router_z_loss_mlp": 3.140625, + "step": 2044, + "time_per_iteration": 2.903182029724121 + }, + { + "auxiliary_loss_clip": 0.01561593, + "auxiliary_loss_mlp": 0.01341317, + "balance_loss_clip": 1.18506682, + "balance_loss_mlp": 1.02908397, + "epoch": 0.24589671135694102, + "flos": 18809444875680.0, + "grad_norm": 3.4132495021841076, + "language_loss": 0.81361371, + "learning_rate": 3.5308085267936482e-06, + "loss": 0.84264278, + "num_input_tokens_seen": 43672895, + "router_z_loss_clip": 3.76757812, + "router_z_loss_mlp": 3.11914062, + "step": 2045, + "time_per_iteration": 2.9180362224578857 + }, + { + "auxiliary_loss_clip": 0.01560207, + "auxiliary_loss_mlp": 0.01346779, + "balance_loss_clip": 1.18597841, + "balance_loss_mlp": 1.03149414, + "epoch": 0.2460169542475801, + "flos": 19940625613440.0, + "grad_norm": 1.74765702266491, + "language_loss": 0.90349466, + "learning_rate": 3.530307103666342e-06, + "loss": 0.9325645, + "num_input_tokens_seen": 43691975, + "router_z_loss_clip": 3.7421875, + "router_z_loss_mlp": 3.15039062, + "step": 2046, + "time_per_iteration": 2.994595527648926 + }, + { + "auxiliary_loss_clip": 0.01559387, + "auxiliary_loss_mlp": 0.01357033, + "balance_loss_clip": 1.1826942, + "balance_loss_mlp": 1.04136693, + "epoch": 0.24613719713821922, + "flos": 24173564315040.0, + "grad_norm": 3.7459227881629107, + "language_loss": 0.80454493, + "learning_rate": 3.5298054483926658e-06, + "loss": 0.83370918, + "num_input_tokens_seen": 43712670, + "router_z_loss_clip": 3.765625, + "router_z_loss_mlp": 3.15429688, + "step": 2047, + "time_per_iteration": 2.978963613510132 + }, + { + "auxiliary_loss_clip": 0.01562588, + "auxiliary_loss_mlp": 0.01366481, + "balance_loss_clip": 1.18452513, + "balance_loss_mlp": 1.05024302, + "epoch": 0.2462574400288583, + "flos": 30223355952960.0, + "grad_norm": 2.5051224083893744, + "language_loss": 0.83131546, + "learning_rate": 3.5293035610487187e-06, + "loss": 0.86060613, + "num_input_tokens_seen": 43732035, + "router_z_loss_clip": 3.78125, + "router_z_loss_mlp": 3.16015625, + "step": 2048, + "time_per_iteration": 3.0481412410736084 + }, + { + "auxiliary_loss_clip": 0.01584485, + "auxiliary_loss_mlp": 0.01254654, + "balance_loss_clip": 1.20441353, + "balance_loss_mlp": 1.00364685, + "epoch": 0.24637768291949738, + "flos": 68950138187520.0, + "grad_norm": 0.7559300367956715, + "language_loss": 0.61939675, + "learning_rate": 3.5288014417106374e-06, + "loss": 0.64778811, + "num_input_tokens_seen": 43798055, + "router_z_loss_clip": 3.796875, + "router_z_loss_mlp": 2.5078125, + "step": 2049, + "time_per_iteration": 3.4881815910339355 + }, + { + "auxiliary_loss_clip": 0.01558207, + "auxiliary_loss_mlp": 0.01351292, + "balance_loss_clip": 1.18220532, + "balance_loss_mlp": 1.03467178, + "epoch": 0.24649792581013646, + "flos": 34386733680480.0, + "grad_norm": 2.0672169883417246, + "language_loss": 0.75547713, + "learning_rate": 3.528299090454593e-06, + "loss": 0.78457212, + "num_input_tokens_seen": 43818590, + "router_z_loss_clip": 3.75976562, + "router_z_loss_mlp": 3.1640625, + "step": 2050, + "time_per_iteration": 3.8278238773345947 + }, + { + "auxiliary_loss_clip": 0.01559409, + "auxiliary_loss_mlp": 0.01361029, + "balance_loss_clip": 1.1821003, + "balance_loss_mlp": 1.04307413, + "epoch": 0.24661816870077558, + "flos": 19682370221760.0, + "grad_norm": 4.614309003676248, + "language_loss": 0.8252157, + "learning_rate": 3.527796507356792e-06, + "loss": 0.85442007, + "num_input_tokens_seen": 43832480, + "router_z_loss_clip": 3.77148438, + "router_z_loss_mlp": 3.17773438, + "step": 2051, + "time_per_iteration": 3.8371505737304688 + }, + { + "auxiliary_loss_clip": 0.01558647, + "auxiliary_loss_mlp": 0.01357847, + "balance_loss_clip": 1.18211472, + "balance_loss_mlp": 1.04065514, + "epoch": 0.24673841159141466, + "flos": 20003852584800.0, + "grad_norm": 2.7561745925393644, + "language_loss": 0.89762568, + "learning_rate": 3.527293692493475e-06, + "loss": 0.92679065, + "num_input_tokens_seen": 43848345, + "router_z_loss_clip": 3.75976562, + "router_z_loss_mlp": 3.16992188, + "step": 2052, + "time_per_iteration": 3.0993711948394775 + }, + { + "auxiliary_loss_clip": 0.01564334, + "auxiliary_loss_mlp": 0.01357876, + "balance_loss_clip": 1.18750739, + "balance_loss_mlp": 1.03801346, + "epoch": 0.24685865448205374, + "flos": 21648396068640.0, + "grad_norm": 2.643750105076044, + "language_loss": 0.73707992, + "learning_rate": 3.52679064594092e-06, + "loss": 0.76630199, + "num_input_tokens_seen": 43865685, + "router_z_loss_clip": 3.76757812, + "router_z_loss_mlp": 3.19726562, + "step": 2053, + "time_per_iteration": 2.944638252258301 + }, + { + "auxiliary_loss_clip": 0.01563528, + "auxiliary_loss_mlp": 0.01349613, + "balance_loss_clip": 1.18736315, + "balance_loss_mlp": 1.03528178, + "epoch": 0.24697889737269285, + "flos": 17962007617440.0, + "grad_norm": 4.017351128709233, + "language_loss": 0.74617863, + "learning_rate": 3.5262873677754375e-06, + "loss": 0.77531004, + "num_input_tokens_seen": 43883690, + "router_z_loss_clip": 3.75976562, + "router_z_loss_mlp": 3.140625, + "step": 2054, + "time_per_iteration": 3.0905747413635254 + }, + { + "auxiliary_loss_clip": 0.01563234, + "auxiliary_loss_mlp": 0.01351009, + "balance_loss_clip": 1.18696833, + "balance_loss_mlp": 1.03324521, + "epoch": 0.24709914026333193, + "flos": 27347083086240.0, + "grad_norm": 1.8221417247421026, + "language_loss": 0.80882049, + "learning_rate": 3.5257838580733745e-06, + "loss": 0.83796287, + "num_input_tokens_seen": 43903295, + "router_z_loss_clip": 3.76367188, + "router_z_loss_mlp": 3.17578125, + "step": 2055, + "time_per_iteration": 3.930751323699951 + }, + { + "auxiliary_loss_clip": 0.01568516, + "auxiliary_loss_mlp": 0.01343317, + "balance_loss_clip": 1.19252241, + "balance_loss_mlp": 1.02688766, + "epoch": 0.24721938315397102, + "flos": 19277217609120.0, + "grad_norm": 2.3734309970020004, + "language_loss": 0.87740749, + "learning_rate": 3.5252801169111138e-06, + "loss": 0.90652585, + "num_input_tokens_seen": 43920960, + "router_z_loss_clip": 3.75976562, + "router_z_loss_mlp": 3.16210938, + "step": 2056, + "time_per_iteration": 2.9346635341644287 + }, + { + "auxiliary_loss_clip": 0.01567981, + "auxiliary_loss_mlp": 0.01359049, + "balance_loss_clip": 1.19321489, + "balance_loss_mlp": 1.04185677, + "epoch": 0.2473396260446101, + "flos": 23188067097120.0, + "grad_norm": 1.855900108373888, + "language_loss": 0.80170906, + "learning_rate": 3.524776144365072e-06, + "loss": 0.83097929, + "num_input_tokens_seen": 43939415, + "router_z_loss_clip": 3.75, + "router_z_loss_mlp": 3.16992188, + "step": 2057, + "time_per_iteration": 3.0649526119232178 + }, + { + "auxiliary_loss_clip": 0.01572542, + "auxiliary_loss_mlp": 0.01344571, + "balance_loss_clip": 1.19739246, + "balance_loss_mlp": 1.02947736, + "epoch": 0.2474598689352492, + "flos": 21144591218880.0, + "grad_norm": 2.9781432225862563, + "language_loss": 0.78887463, + "learning_rate": 3.5242719405117016e-06, + "loss": 0.81804574, + "num_input_tokens_seen": 43959220, + "router_z_loss_clip": 3.75390625, + "router_z_loss_mlp": 3.1484375, + "step": 2058, + "time_per_iteration": 3.8212082386016846 + }, + { + "auxiliary_loss_clip": 0.0156639, + "auxiliary_loss_mlp": 0.01366968, + "balance_loss_clip": 1.18968344, + "balance_loss_mlp": 1.04805911, + "epoch": 0.2475801118258883, + "flos": 21650254548480.0, + "grad_norm": 2.645323283432544, + "language_loss": 0.75519371, + "learning_rate": 3.5237675054274893e-06, + "loss": 0.7845273, + "num_input_tokens_seen": 43978420, + "router_z_loss_clip": 3.76757812, + "router_z_loss_mlp": 3.1875, + "step": 2059, + "time_per_iteration": 2.937603712081909 + }, + { + "auxiliary_loss_clip": 0.01569126, + "auxiliary_loss_mlp": 0.01364495, + "balance_loss_clip": 1.1924262, + "balance_loss_mlp": 1.04158068, + "epoch": 0.24770035471652738, + "flos": 22676714543520.0, + "grad_norm": 1.9987477287905946, + "language_loss": 0.7987296, + "learning_rate": 3.5232628391889584e-06, + "loss": 0.82806581, + "num_input_tokens_seen": 43996710, + "router_z_loss_clip": 3.76757812, + "router_z_loss_mlp": 3.22851562, + "step": 2060, + "time_per_iteration": 2.9862558841705322 + }, + { + "auxiliary_loss_clip": 0.01562337, + "auxiliary_loss_mlp": 0.01362113, + "balance_loss_clip": 1.1876893, + "balance_loss_mlp": 1.04740036, + "epoch": 0.2478205976071665, + "flos": 22166006768640.0, + "grad_norm": 2.4878173174482967, + "language_loss": 0.64574945, + "learning_rate": 3.522757941872666e-06, + "loss": 0.67499399, + "num_input_tokens_seen": 44014865, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 3.14453125, + "step": 2061, + "time_per_iteration": 2.933014154434204 + }, + { + "auxiliary_loss_clip": 0.01569755, + "auxiliary_loss_mlp": 0.01357207, + "balance_loss_clip": 1.19249177, + "balance_loss_mlp": 1.03982401, + "epoch": 0.24794084049780557, + "flos": 24975449488800.0, + "grad_norm": 1.6180397696993043, + "language_loss": 0.82445991, + "learning_rate": 3.5222528135552042e-06, + "loss": 0.85372949, + "num_input_tokens_seen": 44036325, + "router_z_loss_clip": 3.76757812, + "router_z_loss_mlp": 3.171875, + "step": 2062, + "time_per_iteration": 3.0158185958862305 + }, + { + "auxiliary_loss_clip": 0.01568854, + "auxiliary_loss_mlp": 0.01365879, + "balance_loss_clip": 1.19175386, + "balance_loss_mlp": 1.0479238, + "epoch": 0.24806108338844465, + "flos": 18298357819200.0, + "grad_norm": 2.000982986148212, + "language_loss": 0.80926549, + "learning_rate": 3.521747454313201e-06, + "loss": 0.83861279, + "num_input_tokens_seen": 44055005, + "router_z_loss_clip": 3.77148438, + "router_z_loss_mlp": 3.17773438, + "step": 2063, + "time_per_iteration": 2.985050916671753 + }, + { + "auxiliary_loss_clip": 0.01564348, + "auxiliary_loss_mlp": 0.01351614, + "balance_loss_clip": 1.18648076, + "balance_loss_mlp": 1.03156126, + "epoch": 0.24818132627908374, + "flos": 19284234318720.0, + "grad_norm": 2.3032503730342544, + "language_loss": 0.66834092, + "learning_rate": 3.521241864223319e-06, + "loss": 0.69750053, + "num_input_tokens_seen": 44073965, + "router_z_loss_clip": 3.77929688, + "router_z_loss_mlp": 3.19921875, + "step": 2064, + "time_per_iteration": 2.901639461517334 + }, + { + "auxiliary_loss_clip": 0.01595492, + "auxiliary_loss_mlp": 0.0128698, + "balance_loss_clip": 1.21703172, + "balance_loss_mlp": 1.02910614, + "epoch": 0.24830156916972285, + "flos": 70292618890560.0, + "grad_norm": 0.7954426671889927, + "language_loss": 0.61892414, + "learning_rate": 3.5207360433622552e-06, + "loss": 0.64774883, + "num_input_tokens_seen": 44135965, + "router_z_loss_clip": 3.78125, + "router_z_loss_mlp": 2.578125, + "step": 2065, + "time_per_iteration": 3.418785810470581 + }, + { + "auxiliary_loss_clip": 0.01569486, + "auxiliary_loss_mlp": 0.01351778, + "balance_loss_clip": 1.19352818, + "balance_loss_mlp": 1.03611147, + "epoch": 0.24842181206036193, + "flos": 40412478500640.0, + "grad_norm": 7.261022135366195, + "language_loss": 0.7465865, + "learning_rate": 3.5202299918067437e-06, + "loss": 0.77579916, + "num_input_tokens_seen": 44159560, + "router_z_loss_clip": 3.76171875, + "router_z_loss_mlp": 3.15429688, + "step": 2066, + "time_per_iteration": 3.1431100368499756 + }, + { + "auxiliary_loss_clip": 0.01568547, + "auxiliary_loss_mlp": 0.01342017, + "balance_loss_clip": 1.19227624, + "balance_loss_mlp": 1.02940285, + "epoch": 0.248542054951001, + "flos": 20084412725280.0, + "grad_norm": 2.3284310549987772, + "language_loss": 0.69292629, + "learning_rate": 3.519723709633551e-06, + "loss": 0.72203183, + "num_input_tokens_seen": 44178320, + "router_z_loss_clip": 3.76171875, + "router_z_loss_mlp": 3.12304688, + "step": 2067, + "time_per_iteration": 2.9387824535369873 + }, + { + "auxiliary_loss_clip": 0.01575083, + "auxiliary_loss_mlp": 0.01356456, + "balance_loss_clip": 1.19999576, + "balance_loss_mlp": 1.04174316, + "epoch": 0.24866229784164012, + "flos": 23516224816320.0, + "grad_norm": 2.1163124549636345, + "language_loss": 0.83738935, + "learning_rate": 3.519217196919479e-06, + "loss": 0.8667047, + "num_input_tokens_seen": 44197305, + "router_z_loss_clip": 3.75390625, + "router_z_loss_mlp": 3.14453125, + "step": 2068, + "time_per_iteration": 2.9957051277160645 + }, + { + "auxiliary_loss_clip": 0.01572127, + "auxiliary_loss_mlp": 0.01360205, + "balance_loss_clip": 1.19704676, + "balance_loss_mlp": 1.04472971, + "epoch": 0.2487825407322792, + "flos": 19867158038880.0, + "grad_norm": 2.526725596385691, + "language_loss": 0.73806417, + "learning_rate": 3.518710453741367e-06, + "loss": 0.76738745, + "num_input_tokens_seen": 44216505, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 3.15234375, + "step": 2069, + "time_per_iteration": 3.0015504360198975 + }, + { + "auxiliary_loss_clip": 0.01568333, + "auxiliary_loss_mlp": 0.01343575, + "balance_loss_clip": 1.19257116, + "balance_loss_mlp": 1.02886283, + "epoch": 0.2489027836229183, + "flos": 22019792254560.0, + "grad_norm": 3.3918109208281524, + "language_loss": 0.67289472, + "learning_rate": 3.518203480176086e-06, + "loss": 0.70201373, + "num_input_tokens_seen": 44235435, + "router_z_loss_clip": 3.75585938, + "router_z_loss_mlp": 3.14453125, + "step": 2070, + "time_per_iteration": 2.9982619285583496 + }, + { + "auxiliary_loss_clip": 0.01564751, + "auxiliary_loss_mlp": 0.01349911, + "balance_loss_clip": 1.18875444, + "balance_loss_mlp": 1.0363431, + "epoch": 0.2490230265135574, + "flos": 23296580655840.0, + "grad_norm": 1.7443963844941037, + "language_loss": 0.80846095, + "learning_rate": 3.517696276300545e-06, + "loss": 0.83760762, + "num_input_tokens_seen": 44256975, + "router_z_loss_clip": 3.76171875, + "router_z_loss_mlp": 3.1328125, + "step": 2071, + "time_per_iteration": 3.005362033843994 + }, + { + "auxiliary_loss_clip": 0.01572718, + "auxiliary_loss_mlp": 0.01341394, + "balance_loss_clip": 1.19702482, + "balance_loss_mlp": 1.02820778, + "epoch": 0.24914326940419648, + "flos": 19828736448480.0, + "grad_norm": 2.797506882874469, + "language_loss": 0.69588912, + "learning_rate": 3.517188842191685e-06, + "loss": 0.72503024, + "num_input_tokens_seen": 44275125, + "router_z_loss_clip": 3.75585938, + "router_z_loss_mlp": 3.12890625, + "step": 2072, + "time_per_iteration": 2.973576068878174 + }, + { + "auxiliary_loss_clip": 0.01561344, + "auxiliary_loss_mlp": 0.01353394, + "balance_loss_clip": 1.1834681, + "balance_loss_mlp": 1.03791881, + "epoch": 0.24926351229483557, + "flos": 20231309946240.0, + "grad_norm": 1.604337959811367, + "language_loss": 0.74212337, + "learning_rate": 3.5166811779264837e-06, + "loss": 0.77127075, + "num_input_tokens_seen": 44295445, + "router_z_loss_clip": 3.77929688, + "router_z_loss_mlp": 3.15234375, + "step": 2073, + "time_per_iteration": 3.0076980590820312 + }, + { + "auxiliary_loss_clip": 0.01563418, + "auxiliary_loss_mlp": 0.0135288, + "balance_loss_clip": 1.18672872, + "balance_loss_mlp": 1.03854871, + "epoch": 0.24938375518547465, + "flos": 23296808224800.0, + "grad_norm": 1.9918013760191595, + "language_loss": 0.7797612, + "learning_rate": 3.5161732835819545e-06, + "loss": 0.80892414, + "num_input_tokens_seen": 44314755, + "router_z_loss_clip": 3.76757812, + "router_z_loss_mlp": 3.140625, + "step": 2074, + "time_per_iteration": 2.9411561489105225 + }, + { + "auxiliary_loss_clip": 0.01565579, + "auxiliary_loss_mlp": 0.01351575, + "balance_loss_clip": 1.18950605, + "balance_loss_mlp": 1.03838849, + "epoch": 0.24950399807611376, + "flos": 17313619164480.0, + "grad_norm": 2.631139465198587, + "language_loss": 0.83453107, + "learning_rate": 3.515665159235143e-06, + "loss": 0.8637026, + "num_input_tokens_seen": 44333640, + "router_z_loss_clip": 3.75976562, + "router_z_loss_mlp": 3.12890625, + "step": 2075, + "time_per_iteration": 3.0349786281585693 + }, + { + "auxiliary_loss_clip": 0.01568904, + "auxiliary_loss_mlp": 0.01344896, + "balance_loss_clip": 1.19182324, + "balance_loss_mlp": 1.03113675, + "epoch": 0.24962424096675284, + "flos": 19026813346560.0, + "grad_norm": 2.057832618047515, + "language_loss": 0.74905562, + "learning_rate": 3.5151568049631318e-06, + "loss": 0.77819359, + "num_input_tokens_seen": 44352355, + "router_z_loss_clip": 3.76953125, + "router_z_loss_mlp": 3.13476562, + "step": 2076, + "time_per_iteration": 3.089301347732544 + }, + { + "auxiliary_loss_clip": 0.01563465, + "auxiliary_loss_mlp": 0.01353543, + "balance_loss_clip": 1.18706536, + "balance_loss_mlp": 1.0399754, + "epoch": 0.24974448385739192, + "flos": 33401464031520.0, + "grad_norm": 1.8952235431522861, + "language_loss": 0.80465627, + "learning_rate": 3.5146482208430385e-06, + "loss": 0.8338263, + "num_input_tokens_seen": 44374185, + "router_z_loss_clip": 3.76171875, + "router_z_loss_mlp": 3.1328125, + "step": 2077, + "time_per_iteration": 4.7955241203308105 + }, + { + "auxiliary_loss_clip": 0.01559153, + "auxiliary_loss_mlp": 0.01342251, + "balance_loss_clip": 1.18311942, + "balance_loss_mlp": 1.02734792, + "epoch": 0.24986472674803104, + "flos": 30009856154400.0, + "grad_norm": 2.2031402126243402, + "language_loss": 0.67895871, + "learning_rate": 3.514139406952014e-06, + "loss": 0.70797276, + "num_input_tokens_seen": 44396210, + "router_z_loss_clip": 3.7578125, + "router_z_loss_mlp": 3.14648438, + "step": 2078, + "time_per_iteration": 3.09158992767334 + }, + { + "auxiliary_loss_clip": 0.01568672, + "auxiliary_loss_mlp": 0.01350135, + "balance_loss_clip": 1.19224215, + "balance_loss_mlp": 1.03675723, + "epoch": 0.24998496963867012, + "flos": 26615403665280.0, + "grad_norm": 2.1807010466019623, + "language_loss": 0.83340049, + "learning_rate": 3.5136303633672454e-06, + "loss": 0.86258858, + "num_input_tokens_seen": 44416340, + "router_z_loss_clip": 3.76757812, + "router_z_loss_mlp": 3.13085938, + "step": 2079, + "time_per_iteration": 3.0025722980499268 + }, + { + "auxiliary_loss_clip": 0.01569059, + "auxiliary_loss_mlp": 0.01350299, + "balance_loss_clip": 1.19159269, + "balance_loss_mlp": 1.03730273, + "epoch": 0.25010521252930923, + "flos": 23556466958400.0, + "grad_norm": 1.7182458999726387, + "language_loss": 0.74789655, + "learning_rate": 3.5131210901659544e-06, + "loss": 0.77709013, + "num_input_tokens_seen": 44438095, + "router_z_loss_clip": 3.77539062, + "router_z_loss_mlp": 3.12695312, + "step": 2080, + "time_per_iteration": 3.0224289894104004 + }, + { + "auxiliary_loss_clip": 0.01566732, + "auxiliary_loss_mlp": 0.01347328, + "balance_loss_clip": 1.18928027, + "balance_loss_mlp": 1.03528583, + "epoch": 0.2502254554199483, + "flos": 23443477876800.0, + "grad_norm": 3.079027266657781, + "language_loss": 0.82219589, + "learning_rate": 3.5126115874253967e-06, + "loss": 0.85133648, + "num_input_tokens_seen": 44457650, + "router_z_loss_clip": 3.77539062, + "router_z_loss_mlp": 3.1171875, + "step": 2081, + "time_per_iteration": 3.15853214263916 + }, + { + "auxiliary_loss_clip": 0.01566811, + "auxiliary_loss_mlp": 0.01351859, + "balance_loss_clip": 1.18937182, + "balance_loss_mlp": 1.03714657, + "epoch": 0.2503456983105874, + "flos": 28763903711520.0, + "grad_norm": 2.9653663785396107, + "language_loss": 0.81137514, + "learning_rate": 3.5121018552228644e-06, + "loss": 0.84056187, + "num_input_tokens_seen": 44476155, + "router_z_loss_clip": 3.77539062, + "router_z_loss_mlp": 3.14453125, + "step": 2082, + "time_per_iteration": 3.0440573692321777 + }, + { + "auxiliary_loss_clip": 0.01566459, + "auxiliary_loss_mlp": 0.01345127, + "balance_loss_clip": 1.18788052, + "balance_loss_mlp": 1.03155863, + "epoch": 0.2504659412012265, + "flos": 18772236986400.0, + "grad_norm": 2.6914451629792415, + "language_loss": 0.76506925, + "learning_rate": 3.5115918936356827e-06, + "loss": 0.7941851, + "num_input_tokens_seen": 44492910, + "router_z_loss_clip": 3.78320312, + "router_z_loss_mlp": 3.1328125, + "step": 2083, + "time_per_iteration": 3.741178274154663 + }, + { + "auxiliary_loss_clip": 0.01577743, + "auxiliary_loss_mlp": 0.01342174, + "balance_loss_clip": 1.20242643, + "balance_loss_mlp": 1.03070378, + "epoch": 0.25058618409186556, + "flos": 16875279047520.0, + "grad_norm": 1.981816359178496, + "language_loss": 0.7865231, + "learning_rate": 3.5110817027412123e-06, + "loss": 0.81572229, + "num_input_tokens_seen": 44512000, + "router_z_loss_clip": 3.75, + "router_z_loss_mlp": 3.11132812, + "step": 2084, + "time_per_iteration": 2.945803165435791 + }, + { + "auxiliary_loss_clip": 0.01568015, + "auxiliary_loss_mlp": 0.01341931, + "balance_loss_clip": 1.19024312, + "balance_loss_mlp": 1.02836311, + "epoch": 0.25070642698250467, + "flos": 24427837249920.0, + "grad_norm": 1.934090906060545, + "language_loss": 0.68669355, + "learning_rate": 3.5105712826168493e-06, + "loss": 0.71579301, + "num_input_tokens_seen": 44531650, + "router_z_loss_clip": 3.77929688, + "router_z_loss_mlp": 3.1328125, + "step": 2085, + "time_per_iteration": 2.9933063983917236 + }, + { + "auxiliary_loss_clip": 0.01569292, + "auxiliary_loss_mlp": 0.01352066, + "balance_loss_clip": 1.19250202, + "balance_loss_mlp": 1.04059601, + "epoch": 0.2508266698731437, + "flos": 20262714827040.0, + "grad_norm": 2.527985728589024, + "language_loss": 0.70610929, + "learning_rate": 3.5100606333400235e-06, + "loss": 0.73532289, + "num_input_tokens_seen": 44548785, + "router_z_loss_clip": 3.76367188, + "router_z_loss_mlp": 3.11132812, + "step": 2086, + "time_per_iteration": 3.8502097129821777 + }, + { + "auxiliary_loss_clip": 0.01565845, + "auxiliary_loss_mlp": 0.01353511, + "balance_loss_clip": 1.18895972, + "balance_loss_mlp": 1.03650928, + "epoch": 0.25094691276378284, + "flos": 19247367782880.0, + "grad_norm": 2.253008752525353, + "language_loss": 0.77457047, + "learning_rate": 3.5095497549882006e-06, + "loss": 0.8037641, + "num_input_tokens_seen": 44567230, + "router_z_loss_clip": 3.76757812, + "router_z_loss_mlp": 3.16796875, + "step": 2087, + "time_per_iteration": 3.0282411575317383 + }, + { + "auxiliary_loss_clip": 0.01574473, + "auxiliary_loss_mlp": 0.01349038, + "balance_loss_clip": 1.1973772, + "balance_loss_mlp": 1.03604257, + "epoch": 0.25106715565442195, + "flos": 26945685361440.0, + "grad_norm": 3.170853761288832, + "language_loss": 0.73033959, + "learning_rate": 3.50903864763888e-06, + "loss": 0.75957471, + "num_input_tokens_seen": 44588020, + "router_z_loss_clip": 3.77148438, + "router_z_loss_mlp": 3.12695312, + "step": 2088, + "time_per_iteration": 3.0228750705718994 + }, + { + "auxiliary_loss_clip": 0.01566277, + "auxiliary_loss_mlp": 0.01365106, + "balance_loss_clip": 1.18962729, + "balance_loss_mlp": 1.0503937, + "epoch": 0.251187398545061, + "flos": 48363172606080.0, + "grad_norm": 1.9409009724096256, + "language_loss": 0.76195943, + "learning_rate": 3.5085273113695965e-06, + "loss": 0.7912733, + "num_input_tokens_seen": 44612590, + "router_z_loss_clip": 3.76953125, + "router_z_loss_mlp": 3.14453125, + "step": 2089, + "time_per_iteration": 3.167445421218872 + }, + { + "auxiliary_loss_clip": 0.01566301, + "auxiliary_loss_mlp": 0.01344507, + "balance_loss_clip": 1.19068432, + "balance_loss_mlp": 1.03112948, + "epoch": 0.2513076414357001, + "flos": 27018546085440.0, + "grad_norm": 1.8799683914684793, + "language_loss": 0.78593767, + "learning_rate": 3.508015746257919e-06, + "loss": 0.81504571, + "num_input_tokens_seen": 44631630, + "router_z_loss_clip": 3.75585938, + "router_z_loss_mlp": 3.13085938, + "step": 2090, + "time_per_iteration": 3.047248363494873 + }, + { + "auxiliary_loss_clip": 0.01567688, + "auxiliary_loss_mlp": 0.01355585, + "balance_loss_clip": 1.19312394, + "balance_loss_mlp": 1.04010928, + "epoch": 0.2514278843263392, + "flos": 19465646529600.0, + "grad_norm": 2.78860752597157, + "language_loss": 0.83094466, + "learning_rate": 3.5075039523814518e-06, + "loss": 0.8601774, + "num_input_tokens_seen": 44650820, + "router_z_loss_clip": 3.75, + "router_z_loss_mlp": 3.15234375, + "step": 2091, + "time_per_iteration": 2.9893834590911865 + }, + { + "auxiliary_loss_clip": 0.01559928, + "auxiliary_loss_mlp": 0.01358826, + "balance_loss_clip": 1.18516362, + "balance_loss_mlp": 1.04163444, + "epoch": 0.2515481272169783, + "flos": 16867579631040.0, + "grad_norm": 2.1838440343870342, + "language_loss": 0.81781304, + "learning_rate": 3.506991929817834e-06, + "loss": 0.84700054, + "num_input_tokens_seen": 44667540, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 3.16992188, + "step": 2092, + "time_per_iteration": 3.0268971920013428 + }, + { + "auxiliary_loss_clip": 0.0157339, + "auxiliary_loss_mlp": 0.01336241, + "balance_loss_clip": 1.19583869, + "balance_loss_mlp": 1.02801323, + "epoch": 0.2516683701076174, + "flos": 23734731132000.0, + "grad_norm": 3.91433221649219, + "language_loss": 0.82689464, + "learning_rate": 3.506479678644738e-06, + "loss": 0.85599101, + "num_input_tokens_seen": 44687935, + "router_z_loss_clip": 3.77929688, + "router_z_loss_mlp": 3.078125, + "step": 2093, + "time_per_iteration": 3.0668773651123047 + }, + { + "auxiliary_loss_clip": 0.0155951, + "auxiliary_loss_mlp": 0.01353224, + "balance_loss_clip": 1.18436408, + "balance_loss_mlp": 1.04003763, + "epoch": 0.2517886129982565, + "flos": 27638412197760.0, + "grad_norm": 3.8076942873699937, + "language_loss": 0.74846035, + "learning_rate": 3.505967198939873e-06, + "loss": 0.77758771, + "num_input_tokens_seen": 44704975, + "router_z_loss_clip": 3.75, + "router_z_loss_mlp": 3.12890625, + "step": 2094, + "time_per_iteration": 2.987976551055908 + }, + { + "auxiliary_loss_clip": 0.01553865, + "auxiliary_loss_mlp": 0.01351939, + "balance_loss_clip": 1.1785574, + "balance_loss_mlp": 1.03646362, + "epoch": 0.25190885588889556, + "flos": 38107485408960.0, + "grad_norm": 2.1287420389995155, + "language_loss": 0.78255928, + "learning_rate": 3.5054544907809813e-06, + "loss": 0.81161726, + "num_input_tokens_seen": 44725475, + "router_z_loss_clip": 3.75585938, + "router_z_loss_mlp": 3.15234375, + "step": 2095, + "time_per_iteration": 3.1517748832702637 + }, + { + "auxiliary_loss_clip": 0.01565927, + "auxiliary_loss_mlp": 0.01344097, + "balance_loss_clip": 1.19044375, + "balance_loss_mlp": 1.0311017, + "epoch": 0.25202909877953467, + "flos": 22271903284320.0, + "grad_norm": 2.102567512509243, + "language_loss": 0.80550301, + "learning_rate": 3.50494155424584e-06, + "loss": 0.83460331, + "num_input_tokens_seen": 44744380, + "router_z_loss_clip": 3.75585938, + "router_z_loss_mlp": 3.12695312, + "step": 2096, + "time_per_iteration": 3.0442004203796387 + }, + { + "auxiliary_loss_clip": 0.01569941, + "auxiliary_loss_mlp": 0.01341175, + "balance_loss_clip": 1.19509566, + "balance_loss_mlp": 1.03046799, + "epoch": 0.2521493416701738, + "flos": 21763888408800.0, + "grad_norm": 1.7188260907704613, + "language_loss": 0.82994401, + "learning_rate": 3.504428389412262e-06, + "loss": 0.85905522, + "num_input_tokens_seen": 44765190, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 3.10351562, + "step": 2097, + "time_per_iteration": 3.0128636360168457 + }, + { + "auxiliary_loss_clip": 0.01569688, + "auxiliary_loss_mlp": 0.01350218, + "balance_loss_clip": 1.19316471, + "balance_loss_mlp": 1.0366497, + "epoch": 0.25226958456081283, + "flos": 27749277302400.0, + "grad_norm": 1.9908056219765307, + "language_loss": 0.73195148, + "learning_rate": 3.5039149963580927e-06, + "loss": 0.7611506, + "num_input_tokens_seen": 44785210, + "router_z_loss_clip": 3.76757812, + "router_z_loss_mlp": 3.1328125, + "step": 2098, + "time_per_iteration": 3.0787243843078613 + }, + { + "auxiliary_loss_clip": 0.0156712, + "auxiliary_loss_mlp": 0.01351295, + "balance_loss_clip": 1.19283819, + "balance_loss_mlp": 1.03467512, + "epoch": 0.25238982745145194, + "flos": 30734518865760.0, + "grad_norm": 2.5043855663595225, + "language_loss": 0.70210552, + "learning_rate": 3.503401375161215e-06, + "loss": 0.73128963, + "num_input_tokens_seen": 44804955, + "router_z_loss_clip": 3.74414062, + "router_z_loss_mlp": 3.1640625, + "step": 2099, + "time_per_iteration": 3.0158772468566895 + }, + { + "auxiliary_loss_clip": 0.01564272, + "auxiliary_loss_mlp": 0.01332345, + "balance_loss_clip": 1.18925524, + "balance_loss_mlp": 1.0229733, + "epoch": 0.252510070342091, + "flos": 20268404051040.0, + "grad_norm": 1.5877609941076296, + "language_loss": 0.83878696, + "learning_rate": 3.502887525899544e-06, + "loss": 0.86775315, + "num_input_tokens_seen": 44823935, + "router_z_loss_clip": 3.75, + "router_z_loss_mlp": 3.08984375, + "step": 2100, + "time_per_iteration": 2.989412307739258 + }, + { + "auxiliary_loss_clip": 0.01557618, + "auxiliary_loss_mlp": 0.01347957, + "balance_loss_clip": 1.18279576, + "balance_loss_mlp": 1.0336262, + "epoch": 0.2526303132327301, + "flos": 22749613195680.0, + "grad_norm": 2.4274001432209653, + "language_loss": 0.83065051, + "learning_rate": 3.50237344865103e-06, + "loss": 0.85970622, + "num_input_tokens_seen": 44844935, + "router_z_loss_clip": 3.75195312, + "router_z_loss_mlp": 3.140625, + "step": 2101, + "time_per_iteration": 3.0966989994049072 + }, + { + "auxiliary_loss_clip": 0.01557327, + "auxiliary_loss_mlp": 0.01345858, + "balance_loss_clip": 1.18228257, + "balance_loss_mlp": 1.0315268, + "epoch": 0.2527505561233692, + "flos": 30266101353600.0, + "grad_norm": 3.0341018134995688, + "language_loss": 0.76062357, + "learning_rate": 3.501859143493658e-06, + "loss": 0.78965545, + "num_input_tokens_seen": 44865565, + "router_z_loss_clip": 3.75, + "router_z_loss_mlp": 3.140625, + "step": 2102, + "time_per_iteration": 3.0586788654327393 + }, + { + "auxiliary_loss_clip": 0.01624036, + "auxiliary_loss_mlp": 0.01252762, + "balance_loss_clip": 1.2435627, + "balance_loss_mlp": 1.00328064, + "epoch": 0.2528707990140083, + "flos": 58498588150560.0, + "grad_norm": 0.9130150403641671, + "language_loss": 0.60499835, + "learning_rate": 3.5013446105054488e-06, + "loss": 0.63376641, + "num_input_tokens_seen": 44918485, + "router_z_loss_clip": 3.8125, + "router_z_loss_mlp": 2.4921875, + "step": 2103, + "time_per_iteration": 3.27120041847229 + }, + { + "auxiliary_loss_clip": 0.01560226, + "auxiliary_loss_mlp": 0.0135168, + "balance_loss_clip": 1.18651485, + "balance_loss_mlp": 1.03906512, + "epoch": 0.2529910419046474, + "flos": 24647481410400.0, + "grad_norm": 2.0525952809058645, + "language_loss": 0.74881655, + "learning_rate": 3.5008298497644555e-06, + "loss": 0.77793562, + "num_input_tokens_seen": 44937530, + "router_z_loss_clip": 3.73242188, + "router_z_loss_mlp": 3.12304688, + "step": 2104, + "time_per_iteration": 3.856114625930786 + }, + { + "auxiliary_loss_clip": 0.01559143, + "auxiliary_loss_mlp": 0.01345663, + "balance_loss_clip": 1.1835314, + "balance_loss_mlp": 1.03152251, + "epoch": 0.2531112847952865, + "flos": 23844420463680.0, + "grad_norm": 1.5733512534169458, + "language_loss": 0.88093781, + "learning_rate": 3.500314861348767e-06, + "loss": 0.9099859, + "num_input_tokens_seen": 44958165, + "router_z_loss_clip": 3.7578125, + "router_z_loss_mlp": 3.13867188, + "step": 2105, + "time_per_iteration": 3.953150510787964 + }, + { + "auxiliary_loss_clip": 0.01556973, + "auxiliary_loss_mlp": 0.01357831, + "balance_loss_clip": 1.18349659, + "balance_loss_mlp": 1.04426265, + "epoch": 0.25323152768592555, + "flos": 16145685675360.0, + "grad_norm": 2.2218556415648623, + "language_loss": 0.77423602, + "learning_rate": 3.499799645336507e-06, + "loss": 0.80338407, + "num_input_tokens_seen": 44975060, + "router_z_loss_clip": 3.734375, + "router_z_loss_mlp": 3.1328125, + "step": 2106, + "time_per_iteration": 3.0147206783294678 + }, + { + "auxiliary_loss_clip": 0.01561941, + "auxiliary_loss_mlp": 0.01342241, + "balance_loss_clip": 1.18614531, + "balance_loss_mlp": 1.02962613, + "epoch": 0.25335177057656466, + "flos": 28407830502240.0, + "grad_norm": 1.3973999151058414, + "language_loss": 0.87196171, + "learning_rate": 3.4992842018058336e-06, + "loss": 0.90100354, + "num_input_tokens_seen": 44997960, + "router_z_loss_clip": 3.75585938, + "router_z_loss_mlp": 3.12304688, + "step": 2107, + "time_per_iteration": 3.0189523696899414 + }, + { + "auxiliary_loss_clip": 0.01556831, + "auxiliary_loss_mlp": 0.01335021, + "balance_loss_clip": 1.18284822, + "balance_loss_mlp": 1.02202535, + "epoch": 0.25347201346720377, + "flos": 18801252393120.0, + "grad_norm": 2.169782782787866, + "language_loss": 0.88276565, + "learning_rate": 3.4987685308349384e-06, + "loss": 0.91168416, + "num_input_tokens_seen": 45015690, + "router_z_loss_clip": 3.7421875, + "router_z_loss_mlp": 3.12695312, + "step": 2108, + "time_per_iteration": 2.9913523197174072 + }, + { + "auxiliary_loss_clip": 0.01554997, + "auxiliary_loss_mlp": 0.01350502, + "balance_loss_clip": 1.18010092, + "balance_loss_mlp": 1.03807831, + "epoch": 0.2535922563578428, + "flos": 15817338315360.0, + "grad_norm": 2.7213870944079406, + "language_loss": 0.61780417, + "learning_rate": 3.4982526325020497e-06, + "loss": 0.64685917, + "num_input_tokens_seen": 45032660, + "router_z_loss_clip": 3.75, + "router_z_loss_mlp": 3.12109375, + "step": 2109, + "time_per_iteration": 2.984173536300659 + }, + { + "auxiliary_loss_clip": 0.01561628, + "auxiliary_loss_mlp": 0.01355096, + "balance_loss_clip": 1.18527246, + "balance_loss_mlp": 1.04210019, + "epoch": 0.25371249924848194, + "flos": 16320384601920.0, + "grad_norm": 2.380043737723406, + "language_loss": 0.82108939, + "learning_rate": 3.4977365068854273e-06, + "loss": 0.85025668, + "num_input_tokens_seen": 45048280, + "router_z_loss_clip": 3.76367188, + "router_z_loss_mlp": 3.12695312, + "step": 2110, + "time_per_iteration": 3.7778267860412598 + }, + { + "auxiliary_loss_clip": 0.01558824, + "auxiliary_loss_mlp": 0.0134862, + "balance_loss_clip": 1.18156159, + "balance_loss_mlp": 1.03676891, + "epoch": 0.25383274213912105, + "flos": 21763736696160.0, + "grad_norm": 1.8562171360857938, + "language_loss": 0.73574609, + "learning_rate": 3.4972201540633676e-06, + "loss": 0.76482052, + "num_input_tokens_seen": 45067635, + "router_z_loss_clip": 3.77148438, + "router_z_loss_mlp": 3.11523438, + "step": 2111, + "time_per_iteration": 2.9889166355133057 + }, + { + "auxiliary_loss_clip": 0.01559959, + "auxiliary_loss_mlp": 0.01351862, + "balance_loss_clip": 1.18572688, + "balance_loss_mlp": 1.04229927, + "epoch": 0.2539529850297601, + "flos": 21398977938240.0, + "grad_norm": 2.0177174395625244, + "language_loss": 0.85483968, + "learning_rate": 3.4967035741142008e-06, + "loss": 0.88395792, + "num_input_tokens_seen": 45086455, + "router_z_loss_clip": 3.74023438, + "router_z_loss_mlp": 3.09179688, + "step": 2112, + "time_per_iteration": 3.023195266723633 + }, + { + "auxiliary_loss_clip": 0.01565082, + "auxiliary_loss_mlp": 0.01342306, + "balance_loss_clip": 1.19018555, + "balance_loss_mlp": 1.03426981, + "epoch": 0.2540732279203992, + "flos": 25230822340320.0, + "grad_norm": 1.818081396166767, + "language_loss": 0.82101905, + "learning_rate": 3.4961867671162917e-06, + "loss": 0.85009295, + "num_input_tokens_seen": 45106385, + "router_z_loss_clip": 3.75195312, + "router_z_loss_mlp": 3.07617188, + "step": 2113, + "time_per_iteration": 3.0230283737182617 + }, + { + "auxiliary_loss_clip": 0.01553626, + "auxiliary_loss_mlp": 0.01346796, + "balance_loss_clip": 1.1797874, + "balance_loss_mlp": 1.03380013, + "epoch": 0.2541934708110383, + "flos": 19429424772480.0, + "grad_norm": 3.0247079554147565, + "language_loss": 0.77165145, + "learning_rate": 3.4956697331480402e-06, + "loss": 0.80065572, + "num_input_tokens_seen": 45124955, + "router_z_loss_clip": 3.73828125, + "router_z_loss_mlp": 3.12695312, + "step": 2114, + "time_per_iteration": 3.8544838428497314 + }, + { + "auxiliary_loss_clip": 0.0155127, + "auxiliary_loss_mlp": 0.01348479, + "balance_loss_clip": 1.17570412, + "balance_loss_mlp": 1.03395724, + "epoch": 0.2543137137016774, + "flos": 23951606536800.0, + "grad_norm": 1.5491534946138148, + "language_loss": 0.80225956, + "learning_rate": 3.495152472287879e-06, + "loss": 0.83125699, + "num_input_tokens_seen": 45145665, + "router_z_loss_clip": 3.7578125, + "router_z_loss_mlp": 3.14257812, + "step": 2115, + "time_per_iteration": 3.022700548171997 + }, + { + "auxiliary_loss_clip": 0.01557758, + "auxiliary_loss_mlp": 0.01339649, + "balance_loss_clip": 1.18189597, + "balance_loss_mlp": 1.03046799, + "epoch": 0.2544339565923165, + "flos": 25595656954560.0, + "grad_norm": 1.9462394760446338, + "language_loss": 0.74216747, + "learning_rate": 3.4946349846142766e-06, + "loss": 0.77114153, + "num_input_tokens_seen": 45164805, + "router_z_loss_clip": 3.75585938, + "router_z_loss_mlp": 3.08789062, + "step": 2116, + "time_per_iteration": 2.9641613960266113 + }, + { + "auxiliary_loss_clip": 0.0154639, + "auxiliary_loss_mlp": 0.01338759, + "balance_loss_clip": 1.17184114, + "balance_loss_mlp": 1.03015018, + "epoch": 0.25455419948295555, + "flos": 21691634535360.0, + "grad_norm": 1.953009273899986, + "language_loss": 0.7615428, + "learning_rate": 3.4941172702057353e-06, + "loss": 0.79039431, + "num_input_tokens_seen": 45184865, + "router_z_loss_clip": 3.74414062, + "router_z_loss_mlp": 3.08203125, + "step": 2117, + "time_per_iteration": 3.0448925495147705 + }, + { + "auxiliary_loss_clip": 0.01561194, + "auxiliary_loss_mlp": 0.01342366, + "balance_loss_clip": 1.18640757, + "balance_loss_mlp": 1.02822578, + "epoch": 0.25467444237359466, + "flos": 26252768884320.0, + "grad_norm": 2.1045596991320594, + "language_loss": 0.8073563, + "learning_rate": 3.4935993291407924e-06, + "loss": 0.83639193, + "num_input_tokens_seen": 45203690, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 3.13867188, + "step": 2118, + "time_per_iteration": 2.9594624042510986 + }, + { + "auxiliary_loss_clip": 0.01556566, + "auxiliary_loss_mlp": 0.01352682, + "balance_loss_clip": 1.1822927, + "balance_loss_mlp": 1.03682494, + "epoch": 0.25479468526423377, + "flos": 26982134687520.0, + "grad_norm": 2.3138362885554287, + "language_loss": 0.71363568, + "learning_rate": 3.4930811614980183e-06, + "loss": 0.74272817, + "num_input_tokens_seen": 45225385, + "router_z_loss_clip": 3.74023438, + "router_z_loss_mlp": 3.15625, + "step": 2119, + "time_per_iteration": 3.0225510597229004 + }, + { + "auxiliary_loss_clip": 0.01549934, + "auxiliary_loss_mlp": 0.01352091, + "balance_loss_clip": 1.17350841, + "balance_loss_mlp": 1.03909492, + "epoch": 0.2549149281548728, + "flos": 23477992866720.0, + "grad_norm": 1.721339743567734, + "language_loss": 0.79350841, + "learning_rate": 3.4925627673560198e-06, + "loss": 0.82252872, + "num_input_tokens_seen": 45246045, + "router_z_loss_clip": 3.76367188, + "router_z_loss_mlp": 3.12695312, + "step": 2120, + "time_per_iteration": 3.0666182041168213 + }, + { + "auxiliary_loss_clip": 0.01546288, + "auxiliary_loss_mlp": 0.01351422, + "balance_loss_clip": 1.1722796, + "balance_loss_mlp": 1.0407151, + "epoch": 0.25503517104551193, + "flos": 25814694264480.0, + "grad_norm": 1.821744698602626, + "language_loss": 0.88518894, + "learning_rate": 3.4920441467934357e-06, + "loss": 0.91416597, + "num_input_tokens_seen": 45266560, + "router_z_loss_clip": 3.74023438, + "router_z_loss_mlp": 3.10351562, + "step": 2121, + "time_per_iteration": 2.965358257293701 + }, + { + "auxiliary_loss_clip": 0.01554882, + "auxiliary_loss_mlp": 0.01340041, + "balance_loss_clip": 1.17919827, + "balance_loss_mlp": 1.0302875, + "epoch": 0.25515541393615104, + "flos": 26647187827680.0, + "grad_norm": 3.453051716035394, + "language_loss": 0.83162272, + "learning_rate": 3.491525299888941e-06, + "loss": 0.86057198, + "num_input_tokens_seen": 45285405, + "router_z_loss_clip": 3.75976562, + "router_z_loss_mlp": 3.09375, + "step": 2122, + "time_per_iteration": 3.016308307647705 + }, + { + "auxiliary_loss_clip": 0.01598371, + "auxiliary_loss_mlp": 0.01321259, + "balance_loss_clip": 1.21538925, + "balance_loss_mlp": 1.06491089, + "epoch": 0.2552756568267901, + "flos": 65963151928800.0, + "grad_norm": 0.9018293297173485, + "language_loss": 0.62681729, + "learning_rate": 3.491006226721244e-06, + "loss": 0.65601361, + "num_input_tokens_seen": 45349615, + "router_z_loss_clip": 3.828125, + "router_z_loss_mlp": 2.5625, + "step": 2123, + "time_per_iteration": 3.518082618713379 + }, + { + "auxiliary_loss_clip": 0.01556631, + "auxiliary_loss_mlp": 0.01345346, + "balance_loss_clip": 1.17913139, + "balance_loss_mlp": 1.03425753, + "epoch": 0.2553958997174292, + "flos": 17933030138880.0, + "grad_norm": 2.17014857407184, + "language_loss": 0.77692825, + "learning_rate": 3.4904869273690882e-06, + "loss": 0.80594802, + "num_input_tokens_seen": 45367505, + "router_z_loss_clip": 3.77148438, + "router_z_loss_mlp": 3.10742188, + "step": 2124, + "time_per_iteration": 3.0054359436035156 + }, + { + "auxiliary_loss_clip": 0.01548398, + "auxiliary_loss_mlp": 0.01337729, + "balance_loss_clip": 1.17375433, + "balance_loss_mlp": 1.02721262, + "epoch": 0.2555161426080683, + "flos": 23370048230400.0, + "grad_norm": 2.1652376292943543, + "language_loss": 0.89232999, + "learning_rate": 3.489967401911251e-06, + "loss": 0.92119133, + "num_input_tokens_seen": 45386805, + "router_z_loss_clip": 3.74414062, + "router_z_loss_mlp": 3.1015625, + "step": 2125, + "time_per_iteration": 3.063032627105713 + }, + { + "auxiliary_loss_clip": 0.01548971, + "auxiliary_loss_mlp": 0.01342535, + "balance_loss_clip": 1.17342687, + "balance_loss_mlp": 1.02992022, + "epoch": 0.2556363854987074, + "flos": 40628329845120.0, + "grad_norm": 1.9519860962749882, + "language_loss": 0.69620329, + "learning_rate": 3.4894476504265428e-06, + "loss": 0.72511834, + "num_input_tokens_seen": 45411045, + "router_z_loss_clip": 3.75585938, + "router_z_loss_mlp": 3.12304688, + "step": 2126, + "time_per_iteration": 3.188896894454956 + }, + { + "auxiliary_loss_clip": 0.01592005, + "auxiliary_loss_mlp": 0.01254852, + "balance_loss_clip": 1.21012545, + "balance_loss_mlp": 1.00689697, + "epoch": 0.2557566283893465, + "flos": 68025857748480.0, + "grad_norm": 0.7415461069259819, + "language_loss": 0.54397273, + "learning_rate": 3.4889276729938104e-06, + "loss": 0.57244134, + "num_input_tokens_seen": 45469575, + "router_z_loss_clip": 3.8125, + "router_z_loss_mlp": 2.4765625, + "step": 2127, + "time_per_iteration": 3.4169118404388428 + }, + { + "auxiliary_loss_clip": 0.01550484, + "auxiliary_loss_mlp": 0.01357729, + "balance_loss_clip": 1.17568135, + "balance_loss_mlp": 1.04644966, + "epoch": 0.2558768712799856, + "flos": 22637724030720.0, + "grad_norm": 2.672281885829518, + "language_loss": 0.80630124, + "learning_rate": 3.488407469691934e-06, + "loss": 0.83538336, + "num_input_tokens_seen": 45490270, + "router_z_loss_clip": 3.74609375, + "router_z_loss_mlp": 3.109375, + "step": 2128, + "time_per_iteration": 3.1508195400238037 + }, + { + "auxiliary_loss_clip": 0.01549692, + "auxiliary_loss_mlp": 0.01338932, + "balance_loss_clip": 1.17607629, + "balance_loss_mlp": 1.02936935, + "epoch": 0.25599711417062465, + "flos": 26398793757600.0, + "grad_norm": 2.2145599556904423, + "language_loss": 0.80999064, + "learning_rate": 3.487887040599828e-06, + "loss": 0.8388769, + "num_input_tokens_seen": 45510070, + "router_z_loss_clip": 3.73828125, + "router_z_loss_mlp": 3.09179688, + "step": 2129, + "time_per_iteration": 3.130096435546875 + }, + { + "auxiliary_loss_clip": 0.01548504, + "auxiliary_loss_mlp": 0.01346766, + "balance_loss_clip": 1.17457545, + "balance_loss_mlp": 1.03720367, + "epoch": 0.25611735706126376, + "flos": 22853992584960.0, + "grad_norm": 2.331354499706239, + "language_loss": 0.76073909, + "learning_rate": 3.4873663857964407e-06, + "loss": 0.78969181, + "num_input_tokens_seen": 45527285, + "router_z_loss_clip": 3.7421875, + "router_z_loss_mlp": 3.09179688, + "step": 2130, + "time_per_iteration": 3.1741793155670166 + }, + { + "auxiliary_loss_clip": 0.01546362, + "auxiliary_loss_mlp": 0.01337988, + "balance_loss_clip": 1.17326379, + "balance_loss_mlp": 1.02842522, + "epoch": 0.2562375999519028, + "flos": 23370124086720.0, + "grad_norm": 1.9008065382182515, + "language_loss": 0.6647675, + "learning_rate": 3.4868455053607556e-06, + "loss": 0.69361103, + "num_input_tokens_seen": 45546900, + "router_z_loss_clip": 3.72851562, + "router_z_loss_mlp": 3.09179688, + "step": 2131, + "time_per_iteration": 3.0873870849609375 + }, + { + "auxiliary_loss_clip": 0.01545376, + "auxiliary_loss_mlp": 0.01349603, + "balance_loss_clip": 1.17127085, + "balance_loss_mlp": 1.03469992, + "epoch": 0.2563578428425419, + "flos": 22859188742880.0, + "grad_norm": 2.432228234147216, + "language_loss": 0.71967721, + "learning_rate": 3.486324399371789e-06, + "loss": 0.74862695, + "num_input_tokens_seen": 45566200, + "router_z_loss_clip": 3.74023438, + "router_z_loss_mlp": 3.14648438, + "step": 2132, + "time_per_iteration": 3.779324769973755 + }, + { + "auxiliary_loss_clip": 0.01553244, + "auxiliary_loss_mlp": 0.01347034, + "balance_loss_clip": 1.17938614, + "balance_loss_mlp": 1.0395695, + "epoch": 0.25647808573318104, + "flos": 21656095485120.0, + "grad_norm": 2.083226977325276, + "language_loss": 0.78718519, + "learning_rate": 3.485803067908593e-06, + "loss": 0.81618798, + "num_input_tokens_seen": 45585710, + "router_z_loss_clip": 3.73828125, + "router_z_loss_mlp": 3.0703125, + "step": 2133, + "time_per_iteration": 3.898836135864258 + }, + { + "auxiliary_loss_clip": 0.01537758, + "auxiliary_loss_mlp": 0.01347606, + "balance_loss_clip": 1.16314411, + "balance_loss_mlp": 1.03270304, + "epoch": 0.2565983286238201, + "flos": 33733452494880.0, + "grad_norm": 1.8130513023659125, + "language_loss": 0.79760826, + "learning_rate": 3.485281511050253e-06, + "loss": 0.82646191, + "num_input_tokens_seen": 45607845, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 3.14648438, + "step": 2134, + "time_per_iteration": 3.07944917678833 + }, + { + "auxiliary_loss_clip": 0.01546753, + "auxiliary_loss_mlp": 0.01346838, + "balance_loss_clip": 1.17343569, + "balance_loss_mlp": 1.03574991, + "epoch": 0.2567185715144592, + "flos": 16217825764320.0, + "grad_norm": 4.476662582521227, + "language_loss": 0.9035176, + "learning_rate": 3.484759728875889e-06, + "loss": 0.93245351, + "num_input_tokens_seen": 45623210, + "router_z_loss_clip": 3.73242188, + "router_z_loss_mlp": 3.10742188, + "step": 2135, + "time_per_iteration": 2.918548345565796 + }, + { + "auxiliary_loss_clip": 0.01547634, + "auxiliary_loss_mlp": 0.0136593, + "balance_loss_clip": 1.17471588, + "balance_loss_mlp": 1.05503201, + "epoch": 0.2568388144050983, + "flos": 17460478457280.0, + "grad_norm": 1.8783890421168246, + "language_loss": 0.81092286, + "learning_rate": 3.4842377214646543e-06, + "loss": 0.84005845, + "num_input_tokens_seen": 45641505, + "router_z_loss_clip": 3.72851562, + "router_z_loss_mlp": 3.10546875, + "step": 2136, + "time_per_iteration": 3.0348548889160156 + }, + { + "auxiliary_loss_clip": 0.01548374, + "auxiliary_loss_mlp": 0.01338532, + "balance_loss_clip": 1.17374301, + "balance_loss_mlp": 1.029351, + "epoch": 0.25695905729573737, + "flos": 20889597648960.0, + "grad_norm": 1.9420262905229855, + "language_loss": 0.66957581, + "learning_rate": 3.483715488895737e-06, + "loss": 0.69844484, + "num_input_tokens_seen": 45661835, + "router_z_loss_clip": 3.74414062, + "router_z_loss_mlp": 3.08789062, + "step": 2137, + "time_per_iteration": 3.056114673614502 + }, + { + "auxiliary_loss_clip": 0.01541936, + "auxiliary_loss_mlp": 0.01341109, + "balance_loss_clip": 1.16681874, + "balance_loss_mlp": 1.02792239, + "epoch": 0.2570793001863765, + "flos": 24719583571200.0, + "grad_norm": 1.8322625645190487, + "language_loss": 0.78487253, + "learning_rate": 3.48319303124836e-06, + "loss": 0.81370294, + "num_input_tokens_seen": 45682215, + "router_z_loss_clip": 3.75, + "router_z_loss_mlp": 3.12890625, + "step": 2138, + "time_per_iteration": 3.8535027503967285 + }, + { + "auxiliary_loss_clip": 0.01549609, + "auxiliary_loss_mlp": 0.01339298, + "balance_loss_clip": 1.17668891, + "balance_loss_mlp": 1.02573037, + "epoch": 0.2571995430770156, + "flos": 26909349819840.0, + "grad_norm": 2.8234058106611575, + "language_loss": 0.67133641, + "learning_rate": 3.4826703486017798e-06, + "loss": 0.70022547, + "num_input_tokens_seen": 45701840, + "router_z_loss_clip": 3.72851562, + "router_z_loss_mlp": 3.1328125, + "step": 2139, + "time_per_iteration": 3.083522081375122 + }, + { + "auxiliary_loss_clip": 0.01547889, + "auxiliary_loss_mlp": 0.0134893, + "balance_loss_clip": 1.17408895, + "balance_loss_mlp": 1.03879476, + "epoch": 0.25731978596765465, + "flos": 19794638668320.0, + "grad_norm": 1.6238978142664915, + "language_loss": 0.76882708, + "learning_rate": 3.4821474410352867e-06, + "loss": 0.7977953, + "num_input_tokens_seen": 45720500, + "router_z_loss_clip": 3.73828125, + "router_z_loss_mlp": 3.09765625, + "step": 2140, + "time_per_iteration": 3.7691826820373535 + }, + { + "auxiliary_loss_clip": 0.0158451, + "auxiliary_loss_mlp": 0.01248619, + "balance_loss_clip": 1.20436132, + "balance_loss_mlp": 1.00524139, + "epoch": 0.25744002885829376, + "flos": 70571052427680.0, + "grad_norm": 0.9239531691895155, + "language_loss": 0.62592316, + "learning_rate": 3.481624308628205e-06, + "loss": 0.65425444, + "num_input_tokens_seen": 45781870, + "router_z_loss_clip": 3.796875, + "router_z_loss_mlp": 2.4296875, + "step": 2141, + "time_per_iteration": 3.5882327556610107 + }, + { + "auxiliary_loss_clip": 0.01546521, + "auxiliary_loss_mlp": 0.01353277, + "balance_loss_clip": 1.17376637, + "balance_loss_mlp": 1.03932726, + "epoch": 0.25756027174893287, + "flos": 18039988643040.0, + "grad_norm": 3.397423237399808, + "language_loss": 1.0024668, + "learning_rate": 3.481100951459893e-06, + "loss": 1.03146482, + "num_input_tokens_seen": 45794890, + "router_z_loss_clip": 3.73046875, + "router_z_loss_mlp": 3.13671875, + "step": 2142, + "time_per_iteration": 3.0205421447753906 + }, + { + "auxiliary_loss_clip": 0.01545172, + "auxiliary_loss_mlp": 0.01350307, + "balance_loss_clip": 1.17241979, + "balance_loss_mlp": 1.03998184, + "epoch": 0.2576805146395719, + "flos": 22676259405600.0, + "grad_norm": 1.6777367972227646, + "language_loss": 0.78789723, + "learning_rate": 3.4805773696097453e-06, + "loss": 0.81685197, + "num_input_tokens_seen": 45815780, + "router_z_loss_clip": 3.7265625, + "router_z_loss_mlp": 3.09960938, + "step": 2143, + "time_per_iteration": 3.1364831924438477 + }, + { + "auxiliary_loss_clip": 0.01548443, + "auxiliary_loss_mlp": 0.01360403, + "balance_loss_clip": 1.17836297, + "balance_loss_mlp": 1.04664421, + "epoch": 0.25780075753021103, + "flos": 16474108891680.0, + "grad_norm": 3.178078529811469, + "language_loss": 0.87795794, + "learning_rate": 3.4800535631571874e-06, + "loss": 0.90704644, + "num_input_tokens_seen": 45831310, + "router_z_loss_clip": 3.69921875, + "router_z_loss_mlp": 3.13476562, + "step": 2144, + "time_per_iteration": 3.0009942054748535 + }, + { + "auxiliary_loss_clip": 0.01547647, + "auxiliary_loss_mlp": 0.01358983, + "balance_loss_clip": 1.17640328, + "balance_loss_mlp": 1.0456059, + "epoch": 0.25792100042085014, + "flos": 22822511847840.0, + "grad_norm": 2.21589066171559, + "language_loss": 0.76662457, + "learning_rate": 3.4795295321816804e-06, + "loss": 0.79569089, + "num_input_tokens_seen": 45850135, + "router_z_loss_clip": 3.7109375, + "router_z_loss_mlp": 3.13085938, + "step": 2145, + "time_per_iteration": 3.1281492710113525 + }, + { + "auxiliary_loss_clip": 0.01546027, + "auxiliary_loss_mlp": 0.01345601, + "balance_loss_clip": 1.17339611, + "balance_loss_mlp": 1.03317773, + "epoch": 0.2580412433114892, + "flos": 18699262477920.0, + "grad_norm": 2.3679154591924227, + "language_loss": 0.90794384, + "learning_rate": 3.47900527676272e-06, + "loss": 0.93686014, + "num_input_tokens_seen": 45868470, + "router_z_loss_clip": 3.72851562, + "router_z_loss_mlp": 3.12109375, + "step": 2146, + "time_per_iteration": 3.084784507751465 + }, + { + "auxiliary_loss_clip": 0.01554004, + "auxiliary_loss_mlp": 0.01343077, + "balance_loss_clip": 1.18137217, + "balance_loss_mlp": 1.03122592, + "epoch": 0.2581614862021283, + "flos": 14284911565440.0, + "grad_norm": 2.9385963871314553, + "language_loss": 0.88491321, + "learning_rate": 3.478480796979835e-06, + "loss": 0.91388416, + "num_input_tokens_seen": 45886355, + "router_z_loss_clip": 3.72460938, + "router_z_loss_mlp": 3.11523438, + "step": 2147, + "time_per_iteration": 3.1981101036071777 + }, + { + "auxiliary_loss_clip": 0.01548118, + "auxiliary_loss_mlp": 0.0135938, + "balance_loss_clip": 1.17704463, + "balance_loss_mlp": 1.04466724, + "epoch": 0.25828172909276736, + "flos": 29500931003040.0, + "grad_norm": 1.7053709039674565, + "language_loss": 0.78208405, + "learning_rate": 3.4779560929125894e-06, + "loss": 0.81115901, + "num_input_tokens_seen": 45907900, + "router_z_loss_clip": 3.71289062, + "router_z_loss_mlp": 3.14453125, + "step": 2148, + "time_per_iteration": 3.1403744220733643 + }, + { + "auxiliary_loss_clip": 0.0158949, + "auxiliary_loss_mlp": 0.01260841, + "balance_loss_clip": 1.21264076, + "balance_loss_mlp": 1.01059723, + "epoch": 0.2584019719834065, + "flos": 67121148240000.0, + "grad_norm": 0.6943899969967754, + "language_loss": 0.56923902, + "learning_rate": 3.4774311646405783e-06, + "loss": 0.59774232, + "num_input_tokens_seen": 45977805, + "router_z_loss_clip": 3.765625, + "router_z_loss_mlp": 2.5, + "step": 2149, + "time_per_iteration": 3.630599021911621 + }, + { + "auxiliary_loss_clip": 0.01548256, + "auxiliary_loss_mlp": 0.01360893, + "balance_loss_clip": 1.17569637, + "balance_loss_mlp": 1.05209351, + "epoch": 0.2585222148740456, + "flos": 22895789781600.0, + "grad_norm": 2.346206475910391, + "language_loss": 0.83788633, + "learning_rate": 3.476906012243435e-06, + "loss": 0.86697781, + "num_input_tokens_seen": 45996715, + "router_z_loss_clip": 3.72460938, + "router_z_loss_mlp": 3.08398438, + "step": 2150, + "time_per_iteration": 3.004051446914673 + }, + { + "auxiliary_loss_clip": 0.0154968, + "auxiliary_loss_mlp": 0.01351011, + "balance_loss_clip": 1.1775527, + "balance_loss_mlp": 1.04221153, + "epoch": 0.25864245776468464, + "flos": 28911521567520.0, + "grad_norm": 1.996262478730955, + "language_loss": 0.81202227, + "learning_rate": 3.476380635800824e-06, + "loss": 0.84102917, + "num_input_tokens_seen": 46017915, + "router_z_loss_clip": 3.7265625, + "router_z_loss_mlp": 3.08398438, + "step": 2151, + "time_per_iteration": 3.1107840538024902 + }, + { + "auxiliary_loss_clip": 0.0154431, + "auxiliary_loss_mlp": 0.0134517, + "balance_loss_clip": 1.1721561, + "balance_loss_mlp": 1.03637075, + "epoch": 0.25876270065532375, + "flos": 14794936633440.0, + "grad_norm": 2.406445730605538, + "language_loss": 0.85782969, + "learning_rate": 3.475855035392444e-06, + "loss": 0.88672447, + "num_input_tokens_seen": 46033235, + "router_z_loss_clip": 3.72265625, + "router_z_loss_mlp": 3.08398438, + "step": 2152, + "time_per_iteration": 3.0093791484832764 + }, + { + "auxiliary_loss_clip": 0.01542535, + "auxiliary_loss_mlp": 0.01344149, + "balance_loss_clip": 1.1726048, + "balance_loss_mlp": 1.03382337, + "epoch": 0.25888294354596286, + "flos": 60471820712160.0, + "grad_norm": 2.0162273304533675, + "language_loss": 0.71567798, + "learning_rate": 3.475329211098029e-06, + "loss": 0.7445448, + "num_input_tokens_seen": 46056390, + "router_z_loss_clip": 3.69921875, + "router_z_loss_mlp": 3.09960938, + "step": 2153, + "time_per_iteration": 3.2908239364624023 + }, + { + "auxiliary_loss_clip": 0.01546292, + "auxiliary_loss_mlp": 0.01337071, + "balance_loss_clip": 1.17502427, + "balance_loss_mlp": 1.03017855, + "epoch": 0.2590031864366019, + "flos": 27853429122720.0, + "grad_norm": 1.6518169820411057, + "language_loss": 0.82507539, + "learning_rate": 3.4748031629973453e-06, + "loss": 0.85390896, + "num_input_tokens_seen": 46077120, + "router_z_loss_clip": 3.71289062, + "router_z_loss_mlp": 3.06445312, + "step": 2154, + "time_per_iteration": 3.0121307373046875 + }, + { + "auxiliary_loss_clip": 0.01584246, + "auxiliary_loss_mlp": 0.01252937, + "balance_loss_clip": 1.20895815, + "balance_loss_mlp": 1.00650787, + "epoch": 0.25912342932724103, + "flos": 62429767132320.0, + "grad_norm": 0.9116629097584142, + "language_loss": 0.56471968, + "learning_rate": 3.4742768911701944e-06, + "loss": 0.59309149, + "num_input_tokens_seen": 46139815, + "router_z_loss_clip": 3.75, + "router_z_loss_mlp": 2.4609375, + "step": 2155, + "time_per_iteration": 3.5059304237365723 + }, + { + "auxiliary_loss_clip": 0.01539257, + "auxiliary_loss_mlp": 0.01340298, + "balance_loss_clip": 1.16841888, + "balance_loss_mlp": 1.02997255, + "epoch": 0.25924367221788014, + "flos": 12380102497440.0, + "grad_norm": 28.618832555417278, + "language_loss": 0.71488982, + "learning_rate": 3.4737503956964113e-06, + "loss": 0.74368536, + "num_input_tokens_seen": 46152120, + "router_z_loss_clip": 3.70703125, + "router_z_loss_mlp": 3.09960938, + "step": 2156, + "time_per_iteration": 2.981534004211426 + }, + { + "auxiliary_loss_clip": 0.01538887, + "auxiliary_loss_mlp": 0.01344457, + "balance_loss_clip": 1.16756177, + "balance_loss_mlp": 1.03508532, + "epoch": 0.2593639151085192, + "flos": 14576999240160.0, + "grad_norm": 3.722362235934085, + "language_loss": 0.67381859, + "learning_rate": 3.473223676655865e-06, + "loss": 0.70265198, + "num_input_tokens_seen": 46170120, + "router_z_loss_clip": 3.71679688, + "router_z_loss_mlp": 3.08984375, + "step": 2157, + "time_per_iteration": 2.9354798793792725 + }, + { + "auxiliary_loss_clip": 0.01539617, + "auxiliary_loss_mlp": 0.01344992, + "balance_loss_clip": 1.16911983, + "balance_loss_mlp": 1.03447616, + "epoch": 0.2594841579991583, + "flos": 15232707828000.0, + "grad_norm": 1.9960321803620553, + "language_loss": 0.80133808, + "learning_rate": 3.472696734128459e-06, + "loss": 0.83018422, + "num_input_tokens_seen": 46187985, + "router_z_loss_clip": 3.703125, + "router_z_loss_mlp": 3.1015625, + "step": 2158, + "time_per_iteration": 3.1358091831207275 + }, + { + "auxiliary_loss_clip": 0.01539353, + "auxiliary_loss_mlp": 0.01356138, + "balance_loss_clip": 1.16998839, + "balance_loss_mlp": 1.04581201, + "epoch": 0.2596044008897974, + "flos": 23625876219840.0, + "grad_norm": 2.039075525882788, + "language_loss": 0.75887144, + "learning_rate": 3.4721695681941286e-06, + "loss": 0.78782636, + "num_input_tokens_seen": 46207025, + "router_z_loss_clip": 3.6953125, + "router_z_loss_mlp": 3.09960938, + "step": 2159, + "time_per_iteration": 3.8975212574005127 + }, + { + "auxiliary_loss_clip": 0.01536084, + "auxiliary_loss_mlp": 0.01337239, + "balance_loss_clip": 1.16700697, + "balance_loss_mlp": 1.02920222, + "epoch": 0.25972464378043647, + "flos": 13774014149760.0, + "grad_norm": 2.0563884658025877, + "language_loss": 0.82737815, + "learning_rate": 3.471642178932845e-06, + "loss": 0.85611141, + "num_input_tokens_seen": 46225670, + "router_z_loss_clip": 3.68945312, + "router_z_loss_mlp": 3.07617188, + "step": 2160, + "time_per_iteration": 3.8176262378692627 + }, + { + "auxiliary_loss_clip": 0.01536426, + "auxiliary_loss_mlp": 0.0133587, + "balance_loss_clip": 1.16665673, + "balance_loss_mlp": 1.02878737, + "epoch": 0.2598448866710756, + "flos": 19575639286560.0, + "grad_norm": 2.6098931281815574, + "language_loss": 0.8987602, + "learning_rate": 3.471114566424613e-06, + "loss": 0.92748314, + "num_input_tokens_seen": 46244130, + "router_z_loss_clip": 3.70117188, + "router_z_loss_mlp": 3.06640625, + "step": 2161, + "time_per_iteration": 2.932649612426758 + }, + { + "auxiliary_loss_clip": 0.0154421, + "auxiliary_loss_mlp": 0.01354832, + "balance_loss_clip": 1.17443681, + "balance_loss_mlp": 1.04507864, + "epoch": 0.25996512956171464, + "flos": 21655412778240.0, + "grad_norm": 2.645418165600112, + "language_loss": 0.75875449, + "learning_rate": 3.4705867307494715e-06, + "loss": 0.787745, + "num_input_tokens_seen": 46263200, + "router_z_loss_clip": 3.69726562, + "router_z_loss_mlp": 3.09375, + "step": 2162, + "time_per_iteration": 2.9934473037719727 + }, + { + "auxiliary_loss_clip": 0.01543939, + "auxiliary_loss_mlp": 0.01344807, + "balance_loss_clip": 1.17459393, + "balance_loss_mlp": 1.03905952, + "epoch": 0.26008537245235375, + "flos": 18225459167040.0, + "grad_norm": 2.5844804029475412, + "language_loss": 0.85256958, + "learning_rate": 3.470058671987492e-06, + "loss": 0.88145703, + "num_input_tokens_seen": 46281465, + "router_z_loss_clip": 3.6953125, + "router_z_loss_mlp": 3.05273438, + "step": 2163, + "time_per_iteration": 3.0621368885040283 + }, + { + "auxiliary_loss_clip": 0.0153899, + "auxiliary_loss_mlp": 0.01355941, + "balance_loss_clip": 1.17159235, + "balance_loss_mlp": 1.0423727, + "epoch": 0.26020561534299286, + "flos": 24647746907520.0, + "grad_norm": 2.2596688102225437, + "language_loss": 0.84730053, + "learning_rate": 3.4695303902187805e-06, + "loss": 0.87624979, + "num_input_tokens_seen": 46301020, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 3.1328125, + "step": 2164, + "time_per_iteration": 3.010035753250122 + }, + { + "auxiliary_loss_clip": 0.01538634, + "auxiliary_loss_mlp": 0.0134147, + "balance_loss_clip": 1.16684794, + "balance_loss_mlp": 1.03247905, + "epoch": 0.2603258582336319, + "flos": 25775931320640.0, + "grad_norm": 2.083825540630596, + "language_loss": 0.79083407, + "learning_rate": 3.469001885523478e-06, + "loss": 0.81963509, + "num_input_tokens_seen": 46321740, + "router_z_loss_clip": 3.71289062, + "router_z_loss_mlp": 3.0859375, + "step": 2165, + "time_per_iteration": 3.8526885509490967 + }, + { + "auxiliary_loss_clip": 0.01539437, + "auxiliary_loss_mlp": 0.01349024, + "balance_loss_clip": 1.17161214, + "balance_loss_mlp": 1.04098773, + "epoch": 0.260446101124271, + "flos": 28768796444160.0, + "grad_norm": 2.4764592922790265, + "language_loss": 0.8101632, + "learning_rate": 3.4684731579817568e-06, + "loss": 0.83904779, + "num_input_tokens_seen": 46342730, + "router_z_loss_clip": 3.6796875, + "router_z_loss_mlp": 3.07617188, + "step": 2166, + "time_per_iteration": 3.0051770210266113 + }, + { + "auxiliary_loss_clip": 0.01545741, + "auxiliary_loss_mlp": 0.01343694, + "balance_loss_clip": 1.17744958, + "balance_loss_mlp": 1.03947139, + "epoch": 0.26056634401491013, + "flos": 25669162457280.0, + "grad_norm": 1.6909616419292728, + "language_loss": 0.7633909, + "learning_rate": 3.4679442076738247e-06, + "loss": 0.79228532, + "num_input_tokens_seen": 46362445, + "router_z_loss_clip": 3.68554688, + "router_z_loss_mlp": 3.03710938, + "step": 2167, + "time_per_iteration": 3.0648322105407715 + }, + { + "auxiliary_loss_clip": 0.01542711, + "auxiliary_loss_mlp": 0.01357229, + "balance_loss_clip": 1.17239189, + "balance_loss_mlp": 1.04862022, + "epoch": 0.2606865869055492, + "flos": 27055222980480.0, + "grad_norm": 3.1215150649221504, + "language_loss": 0.83683789, + "learning_rate": 3.4674150346799245e-06, + "loss": 0.86583734, + "num_input_tokens_seen": 46382145, + "router_z_loss_clip": 3.70117188, + "router_z_loss_mlp": 3.08203125, + "step": 2168, + "time_per_iteration": 3.757871150970459 + }, + { + "auxiliary_loss_clip": 0.01546614, + "auxiliary_loss_mlp": 0.01352286, + "balance_loss_clip": 1.178563, + "balance_loss_mlp": 1.04367721, + "epoch": 0.2608068297961883, + "flos": 17714561751360.0, + "grad_norm": 2.1536142019192335, + "language_loss": 0.80297959, + "learning_rate": 3.4668856390803295e-06, + "loss": 0.83196855, + "num_input_tokens_seen": 46400025, + "router_z_loss_clip": 3.67773438, + "router_z_loss_mlp": 3.08203125, + "step": 2169, + "time_per_iteration": 2.9798786640167236 + }, + { + "auxiliary_loss_clip": 0.01546804, + "auxiliary_loss_mlp": 0.01346332, + "balance_loss_clip": 1.17574143, + "balance_loss_mlp": 1.04420817, + "epoch": 0.2609270726868274, + "flos": 18553616886240.0, + "grad_norm": 2.378458610682486, + "language_loss": 0.90047693, + "learning_rate": 3.4663560209553495e-06, + "loss": 0.92940831, + "num_input_tokens_seen": 46418090, + "router_z_loss_clip": 3.71289062, + "router_z_loss_mlp": 3.015625, + "step": 2170, + "time_per_iteration": 3.0532610416412354 + }, + { + "auxiliary_loss_clip": 0.0154423, + "auxiliary_loss_mlp": 0.01340712, + "balance_loss_clip": 1.17637348, + "balance_loss_mlp": 1.03324711, + "epoch": 0.26104731557746647, + "flos": 21837886977600.0, + "grad_norm": 2.2202978030322433, + "language_loss": 0.79604244, + "learning_rate": 3.4658261803853267e-06, + "loss": 0.82489192, + "num_input_tokens_seen": 46436015, + "router_z_loss_clip": 3.67773438, + "router_z_loss_mlp": 3.0703125, + "step": 2171, + "time_per_iteration": 3.042109727859497 + }, + { + "auxiliary_loss_clip": 0.01550899, + "auxiliary_loss_mlp": 0.01340213, + "balance_loss_clip": 1.183079, + "balance_loss_mlp": 1.03160441, + "epoch": 0.2611675584681056, + "flos": 21691824176160.0, + "grad_norm": 2.542114776373426, + "language_loss": 0.80900955, + "learning_rate": 3.4652961174506383e-06, + "loss": 0.83792067, + "num_input_tokens_seen": 46455885, + "router_z_loss_clip": 3.67773438, + "router_z_loss_mlp": 3.08203125, + "step": 2172, + "time_per_iteration": 2.981719970703125 + }, + { + "auxiliary_loss_clip": 0.01593123, + "auxiliary_loss_mlp": 0.01234879, + "balance_loss_clip": 1.22192502, + "balance_loss_mlp": 0.99378967, + "epoch": 0.2612878013587447, + "flos": 71869384023840.0, + "grad_norm": 1.472659919201421, + "language_loss": 0.58074194, + "learning_rate": 3.464765832231694e-06, + "loss": 0.60902196, + "num_input_tokens_seen": 46510050, + "router_z_loss_clip": 3.7109375, + "router_z_loss_mlp": 2.40625, + "step": 2173, + "time_per_iteration": 3.4984323978424072 + }, + { + "auxiliary_loss_clip": 0.01549117, + "auxiliary_loss_mlp": 0.01348926, + "balance_loss_clip": 1.18230963, + "balance_loss_mlp": 1.03726506, + "epoch": 0.26140804424938374, + "flos": 20229451466400.0, + "grad_norm": 1.839675434918022, + "language_loss": 0.70962775, + "learning_rate": 3.4642353248089373e-06, + "loss": 0.73860818, + "num_input_tokens_seen": 46528810, + "router_z_loss_clip": 3.66796875, + "router_z_loss_mlp": 3.11328125, + "step": 2174, + "time_per_iteration": 2.965205192565918 + }, + { + "auxiliary_loss_clip": 0.01544127, + "auxiliary_loss_mlp": 0.01342118, + "balance_loss_clip": 1.17651355, + "balance_loss_mlp": 1.03179216, + "epoch": 0.26152828714002285, + "flos": 25559473125600.0, + "grad_norm": 1.7353296346363034, + "language_loss": 0.79961407, + "learning_rate": 3.463704595262846e-06, + "loss": 0.82847655, + "num_input_tokens_seen": 46549690, + "router_z_loss_clip": 3.67382812, + "router_z_loss_mlp": 3.09960938, + "step": 2175, + "time_per_iteration": 3.071359872817993 + }, + { + "auxiliary_loss_clip": 0.01545448, + "auxiliary_loss_mlp": 0.01347235, + "balance_loss_clip": 1.17792892, + "balance_loss_mlp": 1.0351932, + "epoch": 0.26164853003066196, + "flos": 25448835589920.0, + "grad_norm": 3.8730484132759626, + "language_loss": 0.70642567, + "learning_rate": 3.463173643673931e-06, + "loss": 0.73535252, + "num_input_tokens_seen": 46572215, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 3.1171875, + "step": 2176, + "time_per_iteration": 3.145724058151245 + }, + { + "auxiliary_loss_clip": 0.01598541, + "auxiliary_loss_mlp": 0.012603, + "balance_loss_clip": 1.22831023, + "balance_loss_mlp": 1.01615906, + "epoch": 0.261768772921301, + "flos": 53950690729440.0, + "grad_norm": 0.9019102642557515, + "language_loss": 0.63465559, + "learning_rate": 3.4626424701227387e-06, + "loss": 0.66324401, + "num_input_tokens_seen": 46627275, + "router_z_loss_clip": 3.703125, + "router_z_loss_mlp": 2.4375, + "step": 2177, + "time_per_iteration": 3.327173948287964 + }, + { + "auxiliary_loss_clip": 0.01597325, + "auxiliary_loss_mlp": 0.01258041, + "balance_loss_clip": 1.22682619, + "balance_loss_mlp": 1.01390076, + "epoch": 0.26188901581194013, + "flos": 70694433825120.0, + "grad_norm": 0.8207160691438913, + "language_loss": 0.55669367, + "learning_rate": 3.4621110746898452e-06, + "loss": 0.58524734, + "num_input_tokens_seen": 46695135, + "router_z_loss_clip": 3.703125, + "router_z_loss_mlp": 2.4375, + "step": 2178, + "time_per_iteration": 3.539785623550415 + }, + { + "auxiliary_loss_clip": 0.01542431, + "auxiliary_loss_mlp": 0.01355238, + "balance_loss_clip": 1.17515659, + "balance_loss_mlp": 1.04548502, + "epoch": 0.2620092587025792, + "flos": 21071730494880.0, + "grad_norm": 1.747089711963495, + "language_loss": 0.7477293, + "learning_rate": 3.4615794574558654e-06, + "loss": 0.77670604, + "num_input_tokens_seen": 46714145, + "router_z_loss_clip": 3.67382812, + "router_z_loss_mlp": 3.09375, + "step": 2179, + "time_per_iteration": 3.1440463066101074 + }, + { + "auxiliary_loss_clip": 0.0154626, + "auxiliary_loss_mlp": 0.0135337, + "balance_loss_clip": 1.17729926, + "balance_loss_mlp": 1.04228175, + "epoch": 0.2621295015932183, + "flos": 18371597824800.0, + "grad_norm": 15.668876484978659, + "language_loss": 0.84395564, + "learning_rate": 3.4610476185014436e-06, + "loss": 0.87295187, + "num_input_tokens_seen": 46731405, + "router_z_loss_clip": 3.6875, + "router_z_loss_mlp": 3.10742188, + "step": 2180, + "time_per_iteration": 2.9558162689208984 + }, + { + "auxiliary_loss_clip": 0.01541998, + "auxiliary_loss_mlp": 0.01344869, + "balance_loss_clip": 1.17277205, + "balance_loss_mlp": 1.03206325, + "epoch": 0.2622497444838574, + "flos": 23662287617760.0, + "grad_norm": 4.2718146436021165, + "language_loss": 0.79838479, + "learning_rate": 3.4605155579072597e-06, + "loss": 0.82725346, + "num_input_tokens_seen": 46751260, + "router_z_loss_clip": 3.69335938, + "router_z_loss_mlp": 3.125, + "step": 2181, + "time_per_iteration": 3.1014244556427 + }, + { + "auxiliary_loss_clip": 0.01542309, + "auxiliary_loss_mlp": 0.01338254, + "balance_loss_clip": 1.17368221, + "balance_loss_mlp": 1.02697444, + "epoch": 0.26236998737449646, + "flos": 22125802554720.0, + "grad_norm": 1.8397243476922605, + "language_loss": 0.71434426, + "learning_rate": 3.459983275754027e-06, + "loss": 0.74314994, + "num_input_tokens_seen": 46770155, + "router_z_loss_clip": 3.68945312, + "router_z_loss_mlp": 3.109375, + "step": 2182, + "time_per_iteration": 3.0244429111480713 + }, + { + "auxiliary_loss_clip": 0.01542215, + "auxiliary_loss_mlp": 0.01342173, + "balance_loss_clip": 1.17407763, + "balance_loss_mlp": 1.03451812, + "epoch": 0.26249023026513557, + "flos": 17897149735200.0, + "grad_norm": 3.0779593094399202, + "language_loss": 0.79785013, + "learning_rate": 3.4594507721224918e-06, + "loss": 0.82669395, + "num_input_tokens_seen": 46788805, + "router_z_loss_clip": 3.6796875, + "router_z_loss_mlp": 3.07226562, + "step": 2183, + "time_per_iteration": 2.979675769805908 + }, + { + "auxiliary_loss_clip": 0.01548644, + "auxiliary_loss_mlp": 0.01349371, + "balance_loss_clip": 1.17959809, + "balance_loss_mlp": 1.04381371, + "epoch": 0.2626104731557747, + "flos": 18334920929760.0, + "grad_norm": 1.7398207135052026, + "language_loss": 0.81875348, + "learning_rate": 3.4589180470934353e-06, + "loss": 0.84773356, + "num_input_tokens_seen": 46808670, + "router_z_loss_clip": 3.69140625, + "router_z_loss_mlp": 3.05078125, + "step": 2184, + "time_per_iteration": 3.0305464267730713 + }, + { + "auxiliary_loss_clip": 0.01538163, + "auxiliary_loss_mlp": 0.01343001, + "balance_loss_clip": 1.17013705, + "balance_loss_mlp": 1.0326755, + "epoch": 0.26273071604641374, + "flos": 19319735440800.0, + "grad_norm": 1.7129039095336882, + "language_loss": 0.76763904, + "learning_rate": 3.4583851007476713e-06, + "loss": 0.79645067, + "num_input_tokens_seen": 46827140, + "router_z_loss_clip": 3.6796875, + "router_z_loss_mlp": 3.09960938, + "step": 2185, + "time_per_iteration": 3.160463809967041 + }, + { + "auxiliary_loss_clip": 0.01543823, + "auxiliary_loss_mlp": 0.01347647, + "balance_loss_clip": 1.17541111, + "balance_loss_mlp": 1.03541422, + "epoch": 0.26285095893705285, + "flos": 18329155849440.0, + "grad_norm": 2.1083035875608376, + "language_loss": 0.685278, + "learning_rate": 3.4578519331660464e-06, + "loss": 0.71419263, + "num_input_tokens_seen": 46844135, + "router_z_loss_clip": 3.68554688, + "router_z_loss_mlp": 3.11914062, + "step": 2186, + "time_per_iteration": 3.9657697677612305 + }, + { + "auxiliary_loss_clip": 0.01555679, + "auxiliary_loss_mlp": 0.01347694, + "balance_loss_clip": 1.18805158, + "balance_loss_mlp": 1.03984833, + "epoch": 0.26297120182769196, + "flos": 20195922608640.0, + "grad_norm": 3.5314086681248864, + "language_loss": 0.81970268, + "learning_rate": 3.4573185444294426e-06, + "loss": 0.84873641, + "num_input_tokens_seen": 46862500, + "router_z_loss_clip": 3.6796875, + "router_z_loss_mlp": 3.07421875, + "step": 2187, + "time_per_iteration": 4.081469297409058 + }, + { + "auxiliary_loss_clip": 0.01549466, + "auxiliary_loss_mlp": 0.01353194, + "balance_loss_clip": 1.18294215, + "balance_loss_mlp": 1.04096103, + "epoch": 0.263091444718331, + "flos": 22420583128800.0, + "grad_norm": 4.174959969376161, + "language_loss": 0.78851569, + "learning_rate": 3.456784934618774e-06, + "loss": 0.81754225, + "num_input_tokens_seen": 46883665, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 3.11914062, + "step": 2188, + "time_per_iteration": 3.0317330360412598 + }, + { + "auxiliary_loss_clip": 0.01550946, + "auxiliary_loss_mlp": 0.01342057, + "balance_loss_clip": 1.18387496, + "balance_loss_mlp": 1.03516459, + "epoch": 0.2632116876089701, + "flos": 19026775418400.0, + "grad_norm": 3.258865538314204, + "language_loss": 0.80157149, + "learning_rate": 3.4562511038149897e-06, + "loss": 0.83050156, + "num_input_tokens_seen": 46899160, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 3.06445312, + "step": 2189, + "time_per_iteration": 3.0158567428588867 + }, + { + "auxiliary_loss_clip": 0.01584656, + "auxiliary_loss_mlp": 0.01317734, + "balance_loss_clip": 1.21591628, + "balance_loss_mlp": 1.08427429, + "epoch": 0.26333193049960923, + "flos": 67315456025280.0, + "grad_norm": 1.1364407731517558, + "language_loss": 0.57744652, + "learning_rate": 3.4557170520990705e-06, + "loss": 0.60647047, + "num_input_tokens_seen": 46959835, + "router_z_loss_clip": 3.6875, + "router_z_loss_mlp": 2.328125, + "step": 2190, + "time_per_iteration": 3.6054117679595947 + }, + { + "auxiliary_loss_clip": 0.01546061, + "auxiliary_loss_mlp": 0.01343709, + "balance_loss_clip": 1.17931819, + "balance_loss_mlp": 1.03548181, + "epoch": 0.2634521733902483, + "flos": 25051306537440.0, + "grad_norm": 1.613122703221676, + "language_loss": 0.86626208, + "learning_rate": 3.4551827795520324e-06, + "loss": 0.89515972, + "num_input_tokens_seen": 46982720, + "router_z_loss_clip": 3.66210938, + "router_z_loss_mlp": 3.078125, + "step": 2191, + "time_per_iteration": 3.0816638469696045 + }, + { + "auxiliary_loss_clip": 0.01554985, + "auxiliary_loss_mlp": 0.01338048, + "balance_loss_clip": 1.1913265, + "balance_loss_mlp": 1.02924848, + "epoch": 0.2635724162808874, + "flos": 20597092764480.0, + "grad_norm": 2.1571358199278547, + "language_loss": 0.84948981, + "learning_rate": 3.4546482862549226e-06, + "loss": 0.87842011, + "num_input_tokens_seen": 47003035, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 3.08398438, + "step": 2192, + "time_per_iteration": 3.9002978801727295 + }, + { + "auxiliary_loss_clip": 0.01552363, + "auxiliary_loss_mlp": 0.01330454, + "balance_loss_clip": 1.18755591, + "balance_loss_mlp": 1.02108157, + "epoch": 0.2636926591715265, + "flos": 19246647147840.0, + "grad_norm": 2.5088371642230234, + "language_loss": 0.78813779, + "learning_rate": 3.4541135722888253e-06, + "loss": 0.81696594, + "num_input_tokens_seen": 47019625, + "router_z_loss_clip": 3.65039062, + "router_z_loss_mlp": 3.08984375, + "step": 2193, + "time_per_iteration": 3.0520377159118652 + }, + { + "auxiliary_loss_clip": 0.01547737, + "auxiliary_loss_mlp": 0.01361553, + "balance_loss_clip": 1.18239999, + "balance_loss_mlp": 1.04989219, + "epoch": 0.26381290206216557, + "flos": 28807483531680.0, + "grad_norm": 2.545284371866194, + "language_loss": 0.80317497, + "learning_rate": 3.453578637734854e-06, + "loss": 0.83226788, + "num_input_tokens_seen": 47040815, + "router_z_loss_clip": 3.65429688, + "router_z_loss_mlp": 3.11328125, + "step": 2194, + "time_per_iteration": 3.066049337387085 + }, + { + "auxiliary_loss_clip": 0.01562474, + "auxiliary_loss_mlp": 0.0135718, + "balance_loss_clip": 1.19675612, + "balance_loss_mlp": 1.04361176, + "epoch": 0.2639331449528047, + "flos": 25011216108000.0, + "grad_norm": 2.255565662691435, + "language_loss": 0.78376448, + "learning_rate": 3.4530434826741605e-06, + "loss": 0.81296104, + "num_input_tokens_seen": 47061755, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 3.1328125, + "step": 2195, + "time_per_iteration": 3.7113378047943115 + }, + { + "auxiliary_loss_clip": 0.01556668, + "auxiliary_loss_mlp": 0.01348035, + "balance_loss_clip": 1.19104362, + "balance_loss_mlp": 1.04038, + "epoch": 0.26405338784344373, + "flos": 46539302960160.0, + "grad_norm": 1.8900402292819642, + "language_loss": 0.69222152, + "learning_rate": 3.452508107187926e-06, + "loss": 0.72126859, + "num_input_tokens_seen": 47085130, + "router_z_loss_clip": 3.66015625, + "router_z_loss_mlp": 3.07226562, + "step": 2196, + "time_per_iteration": 3.2415037155151367 + }, + { + "auxiliary_loss_clip": 0.01554338, + "auxiliary_loss_mlp": 0.01360775, + "balance_loss_clip": 1.18911552, + "balance_loss_mlp": 1.04835105, + "epoch": 0.26417363073408284, + "flos": 21181647395520.0, + "grad_norm": 10.439500962452026, + "language_loss": 0.77414906, + "learning_rate": 3.451972511357366e-06, + "loss": 0.80330026, + "num_input_tokens_seen": 47104675, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 3.12109375, + "step": 2197, + "time_per_iteration": 3.086599826812744 + }, + { + "auxiliary_loss_clip": 0.01556726, + "auxiliary_loss_mlp": 0.01368866, + "balance_loss_clip": 1.19210672, + "balance_loss_mlp": 1.05758643, + "epoch": 0.26429387362472195, + "flos": 22676980040640.0, + "grad_norm": 1.9880895210028116, + "language_loss": 0.85609347, + "learning_rate": 3.45143669526373e-06, + "loss": 0.88534939, + "num_input_tokens_seen": 47124435, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 3.109375, + "step": 2198, + "time_per_iteration": 3.004032611846924 + }, + { + "auxiliary_loss_clip": 0.01581676, + "auxiliary_loss_mlp": 0.01449875, + "balance_loss_clip": 1.21372783, + "balance_loss_mlp": 1.1912384, + "epoch": 0.264414116515361, + "flos": 67186916398080.0, + "grad_norm": 0.8935371380095705, + "language_loss": 0.63140666, + "learning_rate": 3.450900658988302e-06, + "loss": 0.66172218, + "num_input_tokens_seen": 47185985, + "router_z_loss_clip": 3.6796875, + "router_z_loss_mlp": 2.5859375, + "step": 2199, + "time_per_iteration": 3.4917874336242676 + }, + { + "auxiliary_loss_clip": 0.01556917, + "auxiliary_loss_mlp": 0.013492, + "balance_loss_clip": 1.19127512, + "balance_loss_mlp": 1.03658569, + "epoch": 0.2645343594060001, + "flos": 25667000552160.0, + "grad_norm": 2.701712987579926, + "language_loss": 0.78079808, + "learning_rate": 3.450364402612397e-06, + "loss": 0.80985928, + "num_input_tokens_seen": 47203140, + "router_z_loss_clip": 3.65820312, + "router_z_loss_mlp": 3.12304688, + "step": 2200, + "time_per_iteration": 3.0292937755584717 + }, + { + "auxiliary_loss_clip": 0.01555158, + "auxiliary_loss_mlp": 0.0133818, + "balance_loss_clip": 1.19126987, + "balance_loss_mlp": 1.02709174, + "epoch": 0.26465460229663923, + "flos": 22494012775200.0, + "grad_norm": 2.432564623965901, + "language_loss": 0.84076226, + "learning_rate": 3.449827926217366e-06, + "loss": 0.86969566, + "num_input_tokens_seen": 47222575, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 3.10742188, + "step": 2201, + "time_per_iteration": 3.019319534301758 + }, + { + "auxiliary_loss_clip": 0.01552114, + "auxiliary_loss_mlp": 0.01344606, + "balance_loss_clip": 1.18504167, + "balance_loss_mlp": 1.03752255, + "epoch": 0.2647748451872783, + "flos": 29390634820800.0, + "grad_norm": 2.973300180102982, + "language_loss": 0.80775476, + "learning_rate": 3.449291229884591e-06, + "loss": 0.83672202, + "num_input_tokens_seen": 47243815, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 3.06640625, + "step": 2202, + "time_per_iteration": 3.1542675495147705 + }, + { + "auxiliary_loss_clip": 0.01560596, + "auxiliary_loss_mlp": 0.01344962, + "balance_loss_clip": 1.19456387, + "balance_loss_mlp": 1.03616261, + "epoch": 0.2648950880779174, + "flos": 26800267338720.0, + "grad_norm": 1.8348240887444518, + "language_loss": 0.86870003, + "learning_rate": 3.4487543136954887e-06, + "loss": 0.89775562, + "num_input_tokens_seen": 47263435, + "router_z_loss_clip": 3.66210938, + "router_z_loss_mlp": 3.08398438, + "step": 2203, + "time_per_iteration": 3.1856770515441895 + }, + { + "auxiliary_loss_clip": 0.01553931, + "auxiliary_loss_mlp": 0.01337449, + "balance_loss_clip": 1.18835413, + "balance_loss_mlp": 1.03189123, + "epoch": 0.2650153309685565, + "flos": 28843591504320.0, + "grad_norm": 2.7279581215105484, + "language_loss": 0.91117305, + "learning_rate": 3.448217177731509e-06, + "loss": 0.94008684, + "num_input_tokens_seen": 47283920, + "router_z_loss_clip": 3.65820312, + "router_z_loss_mlp": 3.05078125, + "step": 2204, + "time_per_iteration": 3.1565234661102295 + }, + { + "auxiliary_loss_clip": 0.01559962, + "auxiliary_loss_mlp": 0.01336879, + "balance_loss_clip": 1.192783, + "balance_loss_mlp": 1.03227496, + "epoch": 0.26513557385919556, + "flos": 20305346443200.0, + "grad_norm": 2.177890714792843, + "language_loss": 0.77997899, + "learning_rate": 3.4476798220741348e-06, + "loss": 0.80894744, + "num_input_tokens_seen": 47302800, + "router_z_loss_clip": 3.66992188, + "router_z_loss_mlp": 3.04101562, + "step": 2205, + "time_per_iteration": 3.093287467956543 + }, + { + "auxiliary_loss_clip": 0.01559777, + "auxiliary_loss_mlp": 0.01357563, + "balance_loss_clip": 1.19529676, + "balance_loss_mlp": 1.05334091, + "epoch": 0.26525581674983467, + "flos": 17678226209760.0, + "grad_norm": 1.6115225429903242, + "language_loss": 0.78313744, + "learning_rate": 3.4471422468048826e-06, + "loss": 0.81231081, + "num_input_tokens_seen": 47321525, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 3.03710938, + "step": 2206, + "time_per_iteration": 3.1550748348236084 + }, + { + "auxiliary_loss_clip": 0.01552325, + "auxiliary_loss_mlp": 0.01351889, + "balance_loss_clip": 1.18588161, + "balance_loss_mlp": 1.04556894, + "epoch": 0.2653760596404738, + "flos": 26836185670560.0, + "grad_norm": 4.778376948383422, + "language_loss": 0.73212457, + "learning_rate": 3.4466044520053022e-06, + "loss": 0.76116675, + "num_input_tokens_seen": 47340530, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 3.05859375, + "step": 2207, + "time_per_iteration": 3.113874673843384 + }, + { + "auxiliary_loss_clip": 0.01554086, + "auxiliary_loss_mlp": 0.01337339, + "balance_loss_clip": 1.18912053, + "balance_loss_mlp": 1.03159142, + "epoch": 0.26549630253111284, + "flos": 22784279898240.0, + "grad_norm": 1.855654605447762, + "language_loss": 0.60566109, + "learning_rate": 3.446066437756977e-06, + "loss": 0.63457537, + "num_input_tokens_seen": 47359735, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 3.05273438, + "step": 2208, + "time_per_iteration": 3.0165023803710938 + }, + { + "auxiliary_loss_clip": 0.01543655, + "auxiliary_loss_mlp": 0.01342774, + "balance_loss_clip": 1.1774528, + "balance_loss_mlp": 1.03759813, + "epoch": 0.26561654542175195, + "flos": 23552256932640.0, + "grad_norm": 2.1056569581999067, + "language_loss": 0.75074816, + "learning_rate": 3.4455282041415224e-06, + "loss": 0.77961242, + "num_input_tokens_seen": 47378945, + "router_z_loss_clip": 3.66015625, + "router_z_loss_mlp": 3.046875, + "step": 2209, + "time_per_iteration": 3.120426893234253 + }, + { + "auxiliary_loss_clip": 0.01542293, + "auxiliary_loss_mlp": 0.01336383, + "balance_loss_clip": 1.17702675, + "balance_loss_mlp": 1.03120756, + "epoch": 0.265736788312391, + "flos": 26909122250880.0, + "grad_norm": 2.723091541674903, + "language_loss": 0.87189484, + "learning_rate": 3.4449897512405894e-06, + "loss": 0.90068161, + "num_input_tokens_seen": 47398095, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 3.046875, + "step": 2210, + "time_per_iteration": 3.009530782699585 + }, + { + "auxiliary_loss_clip": 0.01554078, + "auxiliary_loss_mlp": 0.01342591, + "balance_loss_clip": 1.18781424, + "balance_loss_mlp": 1.03855944, + "epoch": 0.2658570312030301, + "flos": 23479396208640.0, + "grad_norm": 2.3346164969532928, + "language_loss": 0.7531355, + "learning_rate": 3.444451079135859e-06, + "loss": 0.78210223, + "num_input_tokens_seen": 47417605, + "router_z_loss_clip": 3.66210938, + "router_z_loss_mlp": 3.03515625, + "step": 2211, + "time_per_iteration": 3.01310658454895 + }, + { + "auxiliary_loss_clip": 0.01542393, + "auxiliary_loss_mlp": 0.01344565, + "balance_loss_clip": 1.17543149, + "balance_loss_mlp": 1.03519297, + "epoch": 0.2659772740936692, + "flos": 21868267798080.0, + "grad_norm": 2.7146727009171943, + "language_loss": 0.74375802, + "learning_rate": 3.4439121879090493e-06, + "loss": 0.77262759, + "num_input_tokens_seen": 47435385, + "router_z_loss_clip": 3.66796875, + "router_z_loss_mlp": 3.08984375, + "step": 2212, + "time_per_iteration": 3.0149459838867188 + }, + { + "auxiliary_loss_clip": 0.01547475, + "auxiliary_loss_mlp": 0.01338113, + "balance_loss_clip": 1.18167889, + "balance_loss_mlp": 1.03217435, + "epoch": 0.2660975169843083, + "flos": 19795207590720.0, + "grad_norm": 2.7269164802111603, + "language_loss": 0.83384025, + "learning_rate": 3.4433730776419082e-06, + "loss": 0.86269611, + "num_input_tokens_seen": 47454310, + "router_z_loss_clip": 3.65820312, + "router_z_loss_mlp": 3.0546875, + "step": 2213, + "time_per_iteration": 3.791919231414795 + }, + { + "auxiliary_loss_clip": 0.01551208, + "auxiliary_loss_mlp": 0.01342619, + "balance_loss_clip": 1.18423605, + "balance_loss_mlp": 1.032866, + "epoch": 0.2662177598749474, + "flos": 29021324683680.0, + "grad_norm": 2.1473724183359093, + "language_loss": 0.80625612, + "learning_rate": 3.4428337484162183e-06, + "loss": 0.83519435, + "num_input_tokens_seen": 47475120, + "router_z_loss_clip": 3.66992188, + "router_z_loss_mlp": 3.09375, + "step": 2214, + "time_per_iteration": 3.906543731689453 + }, + { + "auxiliary_loss_clip": 0.01548783, + "auxiliary_loss_mlp": 0.01346148, + "balance_loss_clip": 1.18243456, + "balance_loss_mlp": 1.03982806, + "epoch": 0.2663380027655865, + "flos": 21764722828320.0, + "grad_norm": 4.519397502173431, + "language_loss": 0.84659159, + "learning_rate": 3.442294200313797e-06, + "loss": 0.87554097, + "num_input_tokens_seen": 47493150, + "router_z_loss_clip": 3.66210938, + "router_z_loss_mlp": 3.05859375, + "step": 2215, + "time_per_iteration": 3.104660987854004 + }, + { + "auxiliary_loss_clip": 0.01569382, + "auxiliary_loss_mlp": 0.01240463, + "balance_loss_clip": 1.19915128, + "balance_loss_mlp": 0.99937439, + "epoch": 0.26645824565622556, + "flos": 66987753444000.0, + "grad_norm": 0.7940311084351416, + "language_loss": 0.52724063, + "learning_rate": 3.4417544334164916e-06, + "loss": 0.5553391, + "num_input_tokens_seen": 47557295, + "router_z_loss_clip": 3.703125, + "router_z_loss_mlp": 2.40625, + "step": 2216, + "time_per_iteration": 3.5275251865386963 + }, + { + "auxiliary_loss_clip": 0.01539308, + "auxiliary_loss_mlp": 0.01351102, + "balance_loss_clip": 1.1727314, + "balance_loss_mlp": 1.04172993, + "epoch": 0.26657848854686467, + "flos": 25266513103200.0, + "grad_norm": 1.7081073117662913, + "language_loss": 0.7779398, + "learning_rate": 3.4412144478061854e-06, + "loss": 0.80684388, + "num_input_tokens_seen": 47579705, + "router_z_loss_clip": 3.6640625, + "router_z_loss_mlp": 3.08984375, + "step": 2217, + "time_per_iteration": 2.9787936210632324 + }, + { + "auxiliary_loss_clip": 0.01542768, + "auxiliary_loss_mlp": 0.01356378, + "balance_loss_clip": 1.1758275, + "balance_loss_mlp": 1.04548001, + "epoch": 0.2666987314375038, + "flos": 23699078297280.0, + "grad_norm": 1.879961441846633, + "language_loss": 0.75389457, + "learning_rate": 3.4406742435647925e-06, + "loss": 0.78288603, + "num_input_tokens_seen": 47599770, + "router_z_loss_clip": 3.66992188, + "router_z_loss_mlp": 3.10546875, + "step": 2218, + "time_per_iteration": 3.0084915161132812 + }, + { + "auxiliary_loss_clip": 0.01546028, + "auxiliary_loss_mlp": 0.01351703, + "balance_loss_clip": 1.17880321, + "balance_loss_mlp": 1.04576433, + "epoch": 0.26681897432814283, + "flos": 27051202595520.0, + "grad_norm": 3.8588070844643982, + "language_loss": 0.79091728, + "learning_rate": 3.440133820774263e-06, + "loss": 0.81989455, + "num_input_tokens_seen": 47619580, + "router_z_loss_clip": 3.67382812, + "router_z_loss_mlp": 3.0546875, + "step": 2219, + "time_per_iteration": 3.9963719844818115 + }, + { + "auxiliary_loss_clip": 0.01546047, + "auxiliary_loss_mlp": 0.01358355, + "balance_loss_clip": 1.1789633, + "balance_loss_mlp": 1.04497719, + "epoch": 0.26693921721878194, + "flos": 28988478532800.0, + "grad_norm": 2.3215191985607704, + "language_loss": 0.81837434, + "learning_rate": 3.439593179516578e-06, + "loss": 0.84741837, + "num_input_tokens_seen": 47639490, + "router_z_loss_clip": 3.66992188, + "router_z_loss_mlp": 3.13085938, + "step": 2220, + "time_per_iteration": 3.0297117233276367 + }, + { + "auxiliary_loss_clip": 0.01543079, + "auxiliary_loss_mlp": 0.01347282, + "balance_loss_clip": 1.17472231, + "balance_loss_mlp": 1.03733826, + "epoch": 0.26705946010942105, + "flos": 21510032683680.0, + "grad_norm": 2.20141263852848, + "language_loss": 0.81033719, + "learning_rate": 3.4390523198737524e-06, + "loss": 0.83924079, + "num_input_tokens_seen": 47658650, + "router_z_loss_clip": 3.6796875, + "router_z_loss_mlp": 3.09570312, + "step": 2221, + "time_per_iteration": 3.7690043449401855 + }, + { + "auxiliary_loss_clip": 0.01540718, + "auxiliary_loss_mlp": 0.01363844, + "balance_loss_clip": 1.17341435, + "balance_loss_mlp": 1.05256462, + "epoch": 0.2671797030000601, + "flos": 21473469573120.0, + "grad_norm": 2.633401342635141, + "language_loss": 0.73713839, + "learning_rate": 3.4385112419278333e-06, + "loss": 0.76618397, + "num_input_tokens_seen": 47679875, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 3.109375, + "step": 2222, + "time_per_iteration": 2.983743667602539 + }, + { + "auxiliary_loss_clip": 0.01564721, + "auxiliary_loss_mlp": 0.01251282, + "balance_loss_clip": 1.19372737, + "balance_loss_mlp": 1.00714111, + "epoch": 0.2672999458906992, + "flos": 64196213179680.0, + "grad_norm": 0.7956105417385774, + "language_loss": 0.64756685, + "learning_rate": 3.4379699457609033e-06, + "loss": 0.67572689, + "num_input_tokens_seen": 47737700, + "router_z_loss_clip": 3.7109375, + "router_z_loss_mlp": 2.4375, + "step": 2223, + "time_per_iteration": 3.323239803314209 + }, + { + "auxiliary_loss_clip": 0.01540535, + "auxiliary_loss_mlp": 0.01356709, + "balance_loss_clip": 1.17366552, + "balance_loss_mlp": 1.04638326, + "epoch": 0.26742018878133833, + "flos": 16910893954080.0, + "grad_norm": 1.9060665171530748, + "language_loss": 0.90231705, + "learning_rate": 3.4374284314550755e-06, + "loss": 0.93128949, + "num_input_tokens_seen": 47756740, + "router_z_loss_clip": 3.66796875, + "router_z_loss_mlp": 3.09960938, + "step": 2224, + "time_per_iteration": 3.0267457962036133 + }, + { + "auxiliary_loss_clip": 0.01548279, + "auxiliary_loss_mlp": 0.01343452, + "balance_loss_clip": 1.18090761, + "balance_loss_mlp": 1.03465199, + "epoch": 0.2675404316719774, + "flos": 20669650063200.0, + "grad_norm": 3.0341579304586452, + "language_loss": 0.81259859, + "learning_rate": 3.436886699092498e-06, + "loss": 0.8415159, + "num_input_tokens_seen": 47775255, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 3.08398438, + "step": 2225, + "time_per_iteration": 3.0597870349884033 + }, + { + "auxiliary_loss_clip": 0.01543965, + "auxiliary_loss_mlp": 0.01357969, + "balance_loss_clip": 1.17662001, + "balance_loss_mlp": 1.04497337, + "epoch": 0.2676606745626165, + "flos": 17486459611200.0, + "grad_norm": 3.2588747283754653, + "language_loss": 0.71743393, + "learning_rate": 3.4363447487553502e-06, + "loss": 0.74645329, + "num_input_tokens_seen": 47788570, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 3.12695312, + "step": 2226, + "time_per_iteration": 2.9947657585144043 + }, + { + "auxiliary_loss_clip": 0.01551689, + "auxiliary_loss_mlp": 0.01357084, + "balance_loss_clip": 1.18541658, + "balance_loss_mlp": 1.04809308, + "epoch": 0.26778091745325555, + "flos": 27855022105440.0, + "grad_norm": 2.1411221765687225, + "language_loss": 0.77820528, + "learning_rate": 3.4358025805258455e-06, + "loss": 0.80729306, + "num_input_tokens_seen": 47808275, + "router_z_loss_clip": 3.66210938, + "router_z_loss_mlp": 3.0859375, + "step": 2227, + "time_per_iteration": 3.0448052883148193 + }, + { + "auxiliary_loss_clip": 0.01546616, + "auxiliary_loss_mlp": 0.01345937, + "balance_loss_clip": 1.1804316, + "balance_loss_mlp": 1.03484797, + "epoch": 0.26790116034389466, + "flos": 20957906993760.0, + "grad_norm": 4.937874280009609, + "language_loss": 0.83450568, + "learning_rate": 3.435260194486232e-06, + "loss": 0.86343122, + "num_input_tokens_seen": 47826245, + "router_z_loss_clip": 3.6640625, + "router_z_loss_mlp": 3.10742188, + "step": 2228, + "time_per_iteration": 2.9436535835266113 + }, + { + "auxiliary_loss_clip": 0.01545388, + "auxiliary_loss_mlp": 0.01361059, + "balance_loss_clip": 1.17920017, + "balance_loss_mlp": 1.04749072, + "epoch": 0.2680214032345338, + "flos": 18042605686080.0, + "grad_norm": 2.8253396548132206, + "language_loss": 0.82997251, + "learning_rate": 3.4347175907187875e-06, + "loss": 0.85903698, + "num_input_tokens_seen": 47843235, + "router_z_loss_clip": 3.65820312, + "router_z_loss_mlp": 3.1328125, + "step": 2229, + "time_per_iteration": 2.936901092529297 + }, + { + "auxiliary_loss_clip": 0.01544005, + "auxiliary_loss_mlp": 0.0136083, + "balance_loss_clip": 1.17946792, + "balance_loss_mlp": 1.04821515, + "epoch": 0.26814164612517283, + "flos": 22421645117280.0, + "grad_norm": 1.7201774827162388, + "language_loss": 0.88211858, + "learning_rate": 3.4341747693058254e-06, + "loss": 0.91116691, + "num_input_tokens_seen": 47861710, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 3.12304688, + "step": 2230, + "time_per_iteration": 2.980123519897461 + }, + { + "auxiliary_loss_clip": 0.01546795, + "auxiliary_loss_mlp": 0.01340917, + "balance_loss_clip": 1.18211353, + "balance_loss_mlp": 1.03230822, + "epoch": 0.26826188901581194, + "flos": 35630334577440.0, + "grad_norm": 2.429717980767428, + "language_loss": 0.77283007, + "learning_rate": 3.4336317303296916e-06, + "loss": 0.80170721, + "num_input_tokens_seen": 47882685, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 3.08203125, + "step": 2231, + "time_per_iteration": 3.0349764823913574 + }, + { + "auxiliary_loss_clip": 0.015474, + "auxiliary_loss_mlp": 0.01338135, + "balance_loss_clip": 1.18219066, + "balance_loss_mlp": 1.03257799, + "epoch": 0.26838213190645105, + "flos": 17641738955520.0, + "grad_norm": 2.1842959992225723, + "language_loss": 0.75475538, + "learning_rate": 3.4330884738727635e-06, + "loss": 0.78361076, + "num_input_tokens_seen": 47900860, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 3.05078125, + "step": 2232, + "time_per_iteration": 2.99421763420105 + }, + { + "auxiliary_loss_clip": 0.0154427, + "auxiliary_loss_mlp": 0.01346903, + "balance_loss_clip": 1.17932844, + "balance_loss_mlp": 1.03848505, + "epoch": 0.2685023747970901, + "flos": 22677055896960.0, + "grad_norm": 2.1488189323894016, + "language_loss": 0.70794088, + "learning_rate": 3.4325450000174535e-06, + "loss": 0.73685265, + "num_input_tokens_seen": 47917500, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 3.08007812, + "step": 2233, + "time_per_iteration": 2.9983887672424316 + }, + { + "auxiliary_loss_clip": 0.01542027, + "auxiliary_loss_mlp": 0.01346815, + "balance_loss_clip": 1.17903578, + "balance_loss_mlp": 1.03591728, + "epoch": 0.2686226176877292, + "flos": 20122606746720.0, + "grad_norm": 1.8850252114599928, + "language_loss": 0.74106175, + "learning_rate": 3.4320013088462067e-06, + "loss": 0.76995015, + "num_input_tokens_seen": 47934860, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 3.10546875, + "step": 2234, + "time_per_iteration": 2.972219467163086 + }, + { + "auxiliary_loss_clip": 0.01539581, + "auxiliary_loss_mlp": 0.01335955, + "balance_loss_clip": 1.17433429, + "balance_loss_mlp": 1.03230476, + "epoch": 0.2687428605783683, + "flos": 21874222519200.0, + "grad_norm": 6.788374156316893, + "language_loss": 0.81693578, + "learning_rate": 3.431457400441499e-06, + "loss": 0.8456912, + "num_input_tokens_seen": 47955255, + "router_z_loss_clip": 3.65429688, + "router_z_loss_mlp": 3.03125, + "step": 2235, + "time_per_iteration": 2.9920125007629395 + }, + { + "auxiliary_loss_clip": 0.01563798, + "auxiliary_loss_mlp": 0.01235931, + "balance_loss_clip": 1.19828606, + "balance_loss_mlp": 0.99255371, + "epoch": 0.2688631034690074, + "flos": 69949289907360.0, + "grad_norm": 0.9123575882947874, + "language_loss": 0.60755527, + "learning_rate": 3.4309132748858424e-06, + "loss": 0.63555264, + "num_input_tokens_seen": 48016245, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 2.4296875, + "step": 2236, + "time_per_iteration": 3.514702558517456 + }, + { + "auxiliary_loss_clip": 0.015416, + "auxiliary_loss_mlp": 0.01334719, + "balance_loss_clip": 1.17592394, + "balance_loss_mlp": 1.03526568, + "epoch": 0.2689833463596465, + "flos": 22858847389440.0, + "grad_norm": 1.6445307815508332, + "language_loss": 0.8377136, + "learning_rate": 3.430368932261779e-06, + "loss": 0.86647677, + "num_input_tokens_seen": 48036600, + "router_z_loss_clip": 3.65429688, + "router_z_loss_mlp": 2.99023438, + "step": 2237, + "time_per_iteration": 3.017451524734497 + }, + { + "auxiliary_loss_clip": 0.01534327, + "auxiliary_loss_mlp": 0.01335401, + "balance_loss_clip": 1.16961884, + "balance_loss_mlp": 1.02965283, + "epoch": 0.2691035892502856, + "flos": 17202071352960.0, + "grad_norm": 2.694219752335018, + "language_loss": 0.7511133, + "learning_rate": 3.429824372651886e-06, + "loss": 0.77981055, + "num_input_tokens_seen": 48054750, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 3.05273438, + "step": 2238, + "time_per_iteration": 3.0428483486175537 + }, + { + "auxiliary_loss_clip": 0.01539842, + "auxiliary_loss_mlp": 0.01331761, + "balance_loss_clip": 1.17404485, + "balance_loss_mlp": 1.02315187, + "epoch": 0.26922383214092466, + "flos": 17749342238400.0, + "grad_norm": 2.2261864962139386, + "language_loss": 0.83294219, + "learning_rate": 3.4292795961387732e-06, + "loss": 0.86165822, + "num_input_tokens_seen": 48072650, + "router_z_loss_clip": 3.66015625, + "router_z_loss_mlp": 3.08203125, + "step": 2239, + "time_per_iteration": 2.9897444248199463 + }, + { + "auxiliary_loss_clip": 0.01539211, + "auxiliary_loss_mlp": 0.0132995, + "balance_loss_clip": 1.17243278, + "balance_loss_mlp": 1.02439272, + "epoch": 0.26934407503156377, + "flos": 16174852794720.0, + "grad_norm": 2.4814229874431923, + "language_loss": 0.87797785, + "learning_rate": 3.4287346028050818e-06, + "loss": 0.9066695, + "num_input_tokens_seen": 48088720, + "router_z_loss_clip": 3.66992188, + "router_z_loss_mlp": 3.05078125, + "step": 2240, + "time_per_iteration": 3.063767910003662 + }, + { + "auxiliary_loss_clip": 0.01533685, + "auxiliary_loss_mlp": 0.01336185, + "balance_loss_clip": 1.1682601, + "balance_loss_mlp": 1.03158188, + "epoch": 0.2694643179222028, + "flos": 23735186269920.0, + "grad_norm": 1.6129655734154105, + "language_loss": 0.79969263, + "learning_rate": 3.4281893927334866e-06, + "loss": 0.82839131, + "num_input_tokens_seen": 48108630, + "router_z_loss_clip": 3.65429688, + "router_z_loss_mlp": 3.04101562, + "step": 2241, + "time_per_iteration": 3.818192720413208 + }, + { + "auxiliary_loss_clip": 0.01538532, + "auxiliary_loss_mlp": 0.01327631, + "balance_loss_clip": 1.17145205, + "balance_loss_mlp": 1.02855837, + "epoch": 0.26958456081284193, + "flos": 24720493847040.0, + "grad_norm": 2.23053624830629, + "language_loss": 0.75074649, + "learning_rate": 3.4276439660066963e-06, + "loss": 0.7794081, + "num_input_tokens_seen": 48128330, + "router_z_loss_clip": 3.66796875, + "router_z_loss_mlp": 2.98632812, + "step": 2242, + "time_per_iteration": 3.9116241931915283 + }, + { + "auxiliary_loss_clip": 0.0153832, + "auxiliary_loss_mlp": 0.01338958, + "balance_loss_clip": 1.17269015, + "balance_loss_mlp": 1.03435445, + "epoch": 0.26970480370348104, + "flos": 18114480277920.0, + "grad_norm": 2.5742163606426365, + "language_loss": 0.84494209, + "learning_rate": 3.427098322707452e-06, + "loss": 0.87371492, + "num_input_tokens_seen": 48144295, + "router_z_loss_clip": 3.66015625, + "router_z_loss_mlp": 3.04101562, + "step": 2243, + "time_per_iteration": 2.9771406650543213 + }, + { + "auxiliary_loss_clip": 0.0154229, + "auxiliary_loss_mlp": 0.01334349, + "balance_loss_clip": 1.17575741, + "balance_loss_mlp": 1.028983, + "epoch": 0.2698250465941201, + "flos": 10818091418400.0, + "grad_norm": 2.056184516641054, + "language_loss": 0.89894319, + "learning_rate": 3.426552462918526e-06, + "loss": 0.92770958, + "num_input_tokens_seen": 48162230, + "router_z_loss_clip": 3.66796875, + "router_z_loss_mlp": 3.04882812, + "step": 2244, + "time_per_iteration": 3.058410882949829 + }, + { + "auxiliary_loss_clip": 0.0154841, + "auxiliary_loss_mlp": 0.01339554, + "balance_loss_clip": 1.1828227, + "balance_loss_mlp": 1.0324707, + "epoch": 0.2699452894847592, + "flos": 17309864276640.0, + "grad_norm": 2.446775255616869, + "language_loss": 0.72939533, + "learning_rate": 3.426006386722726e-06, + "loss": 0.75827491, + "num_input_tokens_seen": 48180290, + "router_z_loss_clip": 3.65429688, + "router_z_loss_mlp": 3.06640625, + "step": 2245, + "time_per_iteration": 2.9743595123291016 + }, + { + "auxiliary_loss_clip": 0.01554124, + "auxiliary_loss_mlp": 0.01331946, + "balance_loss_clip": 1.18944633, + "balance_loss_mlp": 1.02619803, + "epoch": 0.2700655323753983, + "flos": 18080154928800.0, + "grad_norm": 1.9530753011134243, + "language_loss": 0.92092395, + "learning_rate": 3.4254600942028914e-06, + "loss": 0.9497847, + "num_input_tokens_seen": 48198165, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 3.05273438, + "step": 2246, + "time_per_iteration": 2.993215799331665 + }, + { + "auxiliary_loss_clip": 0.01552503, + "auxiliary_loss_mlp": 0.01324163, + "balance_loss_clip": 1.18819618, + "balance_loss_mlp": 1.02146685, + "epoch": 0.2701857752660374, + "flos": 18188592631200.0, + "grad_norm": 2.1945493829828124, + "language_loss": 0.82726526, + "learning_rate": 3.424913585441893e-06, + "loss": 0.85603195, + "num_input_tokens_seen": 48216000, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 3.02148438, + "step": 2247, + "time_per_iteration": 3.8322458267211914 + }, + { + "auxiliary_loss_clip": 0.01561436, + "auxiliary_loss_mlp": 0.01352198, + "balance_loss_clip": 1.19755101, + "balance_loss_mlp": 1.04149103, + "epoch": 0.2703060181566765, + "flos": 16320991452480.0, + "grad_norm": 1.846438752343516, + "language_loss": 0.87321472, + "learning_rate": 3.4243668605226374e-06, + "loss": 0.90235102, + "num_input_tokens_seen": 48233025, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 3.10351562, + "step": 2248, + "time_per_iteration": 3.909788131713867 + }, + { + "auxiliary_loss_clip": 0.01548531, + "auxiliary_loss_mlp": 0.01348801, + "balance_loss_clip": 1.18371189, + "balance_loss_mlp": 1.03904784, + "epoch": 0.2704262610473156, + "flos": 19574349729120.0, + "grad_norm": 2.6401779878436704, + "language_loss": 0.82335812, + "learning_rate": 3.423819919528061e-06, + "loss": 0.8523314, + "num_input_tokens_seen": 48251110, + "router_z_loss_clip": 3.65039062, + "router_z_loss_mlp": 3.09375, + "step": 2249, + "time_per_iteration": 3.0606613159179688 + }, + { + "auxiliary_loss_clip": 0.01549409, + "auxiliary_loss_mlp": 0.01337335, + "balance_loss_clip": 1.18384242, + "balance_loss_mlp": 1.03521085, + "epoch": 0.27054650393795465, + "flos": 20742852140640.0, + "grad_norm": 2.4495409979119125, + "language_loss": 0.78383511, + "learning_rate": 3.4232727625411355e-06, + "loss": 0.81270254, + "num_input_tokens_seen": 48270215, + "router_z_loss_clip": 3.65820312, + "router_z_loss_mlp": 3.015625, + "step": 2250, + "time_per_iteration": 2.948190450668335 + }, + { + "auxiliary_loss_clip": 0.01538573, + "auxiliary_loss_mlp": 0.01332714, + "balance_loss_clip": 1.17144883, + "balance_loss_mlp": 1.02887344, + "epoch": 0.27066674682859376, + "flos": 18660158180640.0, + "grad_norm": 1.7407406612227727, + "language_loss": 0.86649221, + "learning_rate": 3.4227253896448626e-06, + "loss": 0.89520502, + "num_input_tokens_seen": 48288075, + "router_z_loss_clip": 3.67382812, + "router_z_loss_mlp": 3.03320312, + "step": 2251, + "time_per_iteration": 2.9412083625793457 + }, + { + "auxiliary_loss_clip": 0.01543122, + "auxiliary_loss_mlp": 0.01343563, + "balance_loss_clip": 1.17633927, + "balance_loss_mlp": 1.04010367, + "epoch": 0.2707869897192329, + "flos": 23004986047200.0, + "grad_norm": 2.3046951784870613, + "language_loss": 0.81874335, + "learning_rate": 3.42217780092228e-06, + "loss": 0.84761018, + "num_input_tokens_seen": 48306415, + "router_z_loss_clip": 3.66796875, + "router_z_loss_mlp": 3.03125, + "step": 2252, + "time_per_iteration": 2.9597530364990234 + }, + { + "auxiliary_loss_clip": 0.01586013, + "auxiliary_loss_mlp": 0.01267967, + "balance_loss_clip": 1.22086215, + "balance_loss_mlp": 1.01696014, + "epoch": 0.27090723260987193, + "flos": 58329616376160.0, + "grad_norm": 0.7946756584369051, + "language_loss": 0.60259426, + "learning_rate": 3.421629996456456e-06, + "loss": 0.63113403, + "num_input_tokens_seen": 48365035, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 2.5078125, + "step": 2253, + "time_per_iteration": 3.3196496963500977 + }, + { + "auxiliary_loss_clip": 0.01551197, + "auxiliary_loss_mlp": 0.01342104, + "balance_loss_clip": 1.18709135, + "balance_loss_mlp": 1.03673708, + "epoch": 0.27102747550051104, + "flos": 11986176620160.0, + "grad_norm": 1.8314272381088237, + "language_loss": 0.82575548, + "learning_rate": 3.421081976330491e-06, + "loss": 0.85468853, + "num_input_tokens_seen": 48383550, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 3.04882812, + "step": 2254, + "time_per_iteration": 2.986637830734253 + }, + { + "auxiliary_loss_clip": 0.01555121, + "auxiliary_loss_mlp": 0.01345176, + "balance_loss_clip": 1.19279408, + "balance_loss_mlp": 1.03809285, + "epoch": 0.27114771839115015, + "flos": 19902393663840.0, + "grad_norm": 1.9158006862351498, + "language_loss": 0.87954557, + "learning_rate": 3.4205337406275207e-06, + "loss": 0.90854859, + "num_input_tokens_seen": 48403670, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 3.06640625, + "step": 2255, + "time_per_iteration": 2.9113447666168213 + }, + { + "auxiliary_loss_clip": 0.01545115, + "auxiliary_loss_mlp": 0.01334834, + "balance_loss_clip": 1.18159366, + "balance_loss_mlp": 1.02756023, + "epoch": 0.2712679612817892, + "flos": 18333896869440.0, + "grad_norm": 2.667297683257316, + "language_loss": 0.75300139, + "learning_rate": 3.4199852894307114e-06, + "loss": 0.78180093, + "num_input_tokens_seen": 48420420, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 3.06835938, + "step": 2256, + "time_per_iteration": 2.9304120540618896 + }, + { + "auxiliary_loss_clip": 0.01548365, + "auxiliary_loss_mlp": 0.01340798, + "balance_loss_clip": 1.18487751, + "balance_loss_mlp": 1.03276098, + "epoch": 0.2713882041724283, + "flos": 24462503952480.0, + "grad_norm": 2.0767739636501457, + "language_loss": 0.7879225, + "learning_rate": 3.419436622823262e-06, + "loss": 0.81681412, + "num_input_tokens_seen": 48441140, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 3.07617188, + "step": 2257, + "time_per_iteration": 2.9752068519592285 + }, + { + "auxiliary_loss_clip": 0.01558418, + "auxiliary_loss_mlp": 0.01346981, + "balance_loss_clip": 1.19556963, + "balance_loss_mlp": 1.04333091, + "epoch": 0.27150844706306737, + "flos": 23041549157760.0, + "grad_norm": 1.6414955946220275, + "language_loss": 0.74007553, + "learning_rate": 3.4188877408884063e-06, + "loss": 0.76912951, + "num_input_tokens_seen": 48461845, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 3.03125, + "step": 2258, + "time_per_iteration": 2.953151226043701 + }, + { + "auxiliary_loss_clip": 0.01547088, + "auxiliary_loss_mlp": 0.01337387, + "balance_loss_clip": 1.18405294, + "balance_loss_mlp": 1.0322113, + "epoch": 0.2716286899537065, + "flos": 22565773582560.0, + "grad_norm": 3.549445047279512, + "language_loss": 0.65318429, + "learning_rate": 3.4183386437094088e-06, + "loss": 0.68202901, + "num_input_tokens_seen": 48478510, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 3.046875, + "step": 2259, + "time_per_iteration": 2.9995858669281006 + }, + { + "auxiliary_loss_clip": 0.01546127, + "auxiliary_loss_mlp": 0.01340803, + "balance_loss_clip": 1.18197894, + "balance_loss_mlp": 1.03467369, + "epoch": 0.2717489328443456, + "flos": 13116712579200.0, + "grad_norm": 2.4523369015546277, + "language_loss": 0.82427597, + "learning_rate": 3.417789331369565e-06, + "loss": 0.8531453, + "num_input_tokens_seen": 48494300, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 3.05664062, + "step": 2260, + "time_per_iteration": 2.936772108078003 + }, + { + "auxiliary_loss_clip": 0.01550088, + "auxiliary_loss_mlp": 0.01336064, + "balance_loss_clip": 1.18723202, + "balance_loss_mlp": 1.03203273, + "epoch": 0.27186917573498465, + "flos": 29281286842560.0, + "grad_norm": 1.935239922099667, + "language_loss": 0.91393983, + "learning_rate": 3.4172398039522088e-06, + "loss": 0.94280136, + "num_input_tokens_seen": 48515585, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 3.03515625, + "step": 2261, + "time_per_iteration": 3.1680798530578613 + }, + { + "auxiliary_loss_clip": 0.01546519, + "auxiliary_loss_mlp": 0.01333959, + "balance_loss_clip": 1.1857245, + "balance_loss_mlp": 1.02802014, + "epoch": 0.27198941862562376, + "flos": 26034679778400.0, + "grad_norm": 2.3838672238805727, + "language_loss": 0.79918259, + "learning_rate": 3.4166900615407e-06, + "loss": 0.82798737, + "num_input_tokens_seen": 48533500, + "router_z_loss_clip": 3.60742188, + "router_z_loss_mlp": 3.0546875, + "step": 2262, + "time_per_iteration": 3.0698041915893555 + }, + { + "auxiliary_loss_clip": 0.01551251, + "auxiliary_loss_mlp": 0.01338836, + "balance_loss_clip": 1.1892395, + "balance_loss_mlp": 1.03308797, + "epoch": 0.27210966151626287, + "flos": 32783759824320.0, + "grad_norm": 2.040734730793997, + "language_loss": 0.74876112, + "learning_rate": 3.416140104218436e-06, + "loss": 0.77766204, + "num_input_tokens_seen": 48552865, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 3.05273438, + "step": 2263, + "time_per_iteration": 3.004788875579834 + }, + { + "auxiliary_loss_clip": 0.01594938, + "auxiliary_loss_mlp": 0.01241562, + "balance_loss_clip": 1.23393381, + "balance_loss_mlp": 1.0019989, + "epoch": 0.2722299044069019, + "flos": 65477628816480.0, + "grad_norm": 1.802049187369619, + "language_loss": 0.69593012, + "learning_rate": 3.4155899320688437e-06, + "loss": 0.7242952, + "num_input_tokens_seen": 48618940, + "router_z_loss_clip": 3.609375, + "router_z_loss_mlp": 2.390625, + "step": 2264, + "time_per_iteration": 3.494220018386841 + }, + { + "auxiliary_loss_clip": 0.01558597, + "auxiliary_loss_mlp": 0.01345195, + "balance_loss_clip": 1.19920647, + "balance_loss_mlp": 1.03620458, + "epoch": 0.27235014729754103, + "flos": 15335570090880.0, + "grad_norm": 3.243640813478403, + "language_loss": 0.7427578, + "learning_rate": 3.415039545175384e-06, + "loss": 0.77179575, + "num_input_tokens_seen": 48634665, + "router_z_loss_clip": 3.59570312, + "router_z_loss_mlp": 3.0859375, + "step": 2265, + "time_per_iteration": 2.9673681259155273 + }, + { + "auxiliary_loss_clip": 0.01555002, + "auxiliary_loss_mlp": 0.01331967, + "balance_loss_clip": 1.19395161, + "balance_loss_mlp": 1.02621925, + "epoch": 0.27247039018818014, + "flos": 21874563872640.0, + "grad_norm": 2.695631496322375, + "language_loss": 0.65042418, + "learning_rate": 3.414488943621551e-06, + "loss": 0.67929387, + "num_input_tokens_seen": 48653330, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 3.05273438, + "step": 2266, + "time_per_iteration": 2.961205005645752 + }, + { + "auxiliary_loss_clip": 0.01557779, + "auxiliary_loss_mlp": 0.01336132, + "balance_loss_clip": 1.19525361, + "balance_loss_mlp": 1.02828646, + "epoch": 0.2725906330788192, + "flos": 18697366069920.0, + "grad_norm": 1.9875818283072162, + "language_loss": 0.74142778, + "learning_rate": 3.41393812749087e-06, + "loss": 0.77036685, + "num_input_tokens_seen": 48671375, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 3.07421875, + "step": 2267, + "time_per_iteration": 2.973113536834717 + }, + { + "auxiliary_loss_clip": 0.01558077, + "auxiliary_loss_mlp": 0.01344276, + "balance_loss_clip": 1.19627774, + "balance_loss_mlp": 1.03871894, + "epoch": 0.2727108759694583, + "flos": 17887553910720.0, + "grad_norm": 3.6017612763153064, + "language_loss": 0.72122908, + "learning_rate": 3.4133870968668984e-06, + "loss": 0.75025266, + "num_input_tokens_seen": 48686175, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 3.05078125, + "step": 2268, + "time_per_iteration": 3.767462968826294 + }, + { + "auxiliary_loss_clip": 0.01566279, + "auxiliary_loss_mlp": 0.01345848, + "balance_loss_clip": 1.20732272, + "balance_loss_mlp": 1.03647649, + "epoch": 0.2728311188600974, + "flos": 24463945222560.0, + "grad_norm": 2.184043345537399, + "language_loss": 0.78947932, + "learning_rate": 3.412835851833229e-06, + "loss": 0.81860054, + "num_input_tokens_seen": 48708370, + "router_z_loss_clip": 3.59179688, + "router_z_loss_mlp": 3.08984375, + "step": 2269, + "time_per_iteration": 3.7952282428741455 + }, + { + "auxiliary_loss_clip": 0.01558242, + "auxiliary_loss_mlp": 0.01336749, + "balance_loss_clip": 1.19500351, + "balance_loss_mlp": 1.03214526, + "epoch": 0.2729513617507365, + "flos": 30995998151040.0, + "grad_norm": 2.081232140039751, + "language_loss": 0.78173405, + "learning_rate": 3.4122843924734834e-06, + "loss": 0.81068397, + "num_input_tokens_seen": 48730670, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 3.04101562, + "step": 2270, + "time_per_iteration": 3.0194687843322754 + }, + { + "auxiliary_loss_clip": 0.01560805, + "auxiliary_loss_mlp": 0.01357888, + "balance_loss_clip": 1.19977534, + "balance_loss_mlp": 1.04927945, + "epoch": 0.2730716046413756, + "flos": 19096412248800.0, + "grad_norm": 4.468238979336331, + "language_loss": 0.88033152, + "learning_rate": 3.411732718871319e-06, + "loss": 0.90951848, + "num_input_tokens_seen": 48746510, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 3.08203125, + "step": 2271, + "time_per_iteration": 2.8923401832580566 + }, + { + "auxiliary_loss_clip": 0.01563309, + "auxiliary_loss_mlp": 0.01355485, + "balance_loss_clip": 1.20077825, + "balance_loss_mlp": 1.04973722, + "epoch": 0.27319184753201464, + "flos": 26947088703360.0, + "grad_norm": 1.6184908194105452, + "language_loss": 0.78895676, + "learning_rate": 3.4111808311104227e-06, + "loss": 0.81814474, + "num_input_tokens_seen": 48768825, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 3.0546875, + "step": 2272, + "time_per_iteration": 2.957134246826172 + }, + { + "auxiliary_loss_clip": 0.01560203, + "auxiliary_loss_mlp": 0.01355945, + "balance_loss_clip": 1.19946861, + "balance_loss_mlp": 1.04905248, + "epoch": 0.27331209042265375, + "flos": 31762571843520.0, + "grad_norm": 2.1061201201698756, + "language_loss": 0.6968832, + "learning_rate": 3.410628729274517e-06, + "loss": 0.72604465, + "num_input_tokens_seen": 48790345, + "router_z_loss_clip": 3.609375, + "router_z_loss_mlp": 3.06445312, + "step": 2273, + "time_per_iteration": 3.004972219467163 + }, + { + "auxiliary_loss_clip": 0.01560481, + "auxiliary_loss_mlp": 0.01338984, + "balance_loss_clip": 1.1997292, + "balance_loss_mlp": 1.03228199, + "epoch": 0.27343233331329286, + "flos": 25741454258880.0, + "grad_norm": 1.7739186394296693, + "language_loss": 0.82732618, + "learning_rate": 3.4100764134473546e-06, + "loss": 0.85632086, + "num_input_tokens_seen": 48809630, + "router_z_loss_clip": 3.60742188, + "router_z_loss_mlp": 3.0625, + "step": 2274, + "time_per_iteration": 3.792080879211426 + }, + { + "auxiliary_loss_clip": 0.01565997, + "auxiliary_loss_mlp": 0.01350718, + "balance_loss_clip": 1.20299816, + "balance_loss_mlp": 1.04420733, + "epoch": 0.2735525762039319, + "flos": 24391729277280.0, + "grad_norm": 2.482527422352746, + "language_loss": 0.85354269, + "learning_rate": 3.4095238837127215e-06, + "loss": 0.88270986, + "num_input_tokens_seen": 48828770, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 3.06054688, + "step": 2275, + "time_per_iteration": 2.9757556915283203 + }, + { + "auxiliary_loss_clip": 0.0156345, + "auxiliary_loss_mlp": 0.01355445, + "balance_loss_clip": 1.20220006, + "balance_loss_mlp": 1.04702711, + "epoch": 0.27367281909457103, + "flos": 14467120267680.0, + "grad_norm": 10.65882944891064, + "language_loss": 0.79833162, + "learning_rate": 3.4089711401544355e-06, + "loss": 0.82752061, + "num_input_tokens_seen": 48846365, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 3.08007812, + "step": 2276, + "time_per_iteration": 3.7517318725585938 + }, + { + "auxiliary_loss_clip": 0.01564771, + "auxiliary_loss_mlp": 0.01332258, + "balance_loss_clip": 1.20211887, + "balance_loss_mlp": 1.02689135, + "epoch": 0.27379306198521014, + "flos": 23479244496000.0, + "grad_norm": 2.458325859393126, + "language_loss": 0.68277621, + "learning_rate": 3.4084181828563486e-06, + "loss": 0.71174657, + "num_input_tokens_seen": 48863085, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 3.04882812, + "step": 2277, + "time_per_iteration": 3.114001750946045 + }, + { + "auxiliary_loss_clip": 0.01569896, + "auxiliary_loss_mlp": 0.01345766, + "balance_loss_clip": 1.20925307, + "balance_loss_mlp": 1.03715706, + "epoch": 0.2739133048758492, + "flos": 17460137103840.0, + "grad_norm": 2.11112144968361, + "language_loss": 0.70936489, + "learning_rate": 3.4078650119023428e-06, + "loss": 0.73852152, + "num_input_tokens_seen": 48881400, + "router_z_loss_clip": 3.60742188, + "router_z_loss_mlp": 3.08203125, + "step": 2278, + "time_per_iteration": 3.083005428314209 + }, + { + "auxiliary_loss_clip": 0.01561163, + "auxiliary_loss_mlp": 0.01338739, + "balance_loss_clip": 1.19960189, + "balance_loss_mlp": 1.03432655, + "epoch": 0.2740335477664883, + "flos": 19274676422400.0, + "grad_norm": 2.148110346430021, + "language_loss": 0.74379182, + "learning_rate": 3.4073116273763337e-06, + "loss": 0.77279085, + "num_input_tokens_seen": 48895845, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 3.0390625, + "step": 2279, + "time_per_iteration": 2.963975429534912 + }, + { + "auxiliary_loss_clip": 0.01567938, + "auxiliary_loss_mlp": 0.01343437, + "balance_loss_clip": 1.20704675, + "balance_loss_mlp": 1.03654444, + "epoch": 0.2741537906571274, + "flos": 26107161220800.0, + "grad_norm": 1.8206266583519253, + "language_loss": 0.81321907, + "learning_rate": 3.40675802936227e-06, + "loss": 0.84233284, + "num_input_tokens_seen": 48916630, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 3.06445312, + "step": 2280, + "time_per_iteration": 2.981527328491211 + }, + { + "auxiliary_loss_clip": 0.01562669, + "auxiliary_loss_mlp": 0.01337442, + "balance_loss_clip": 1.20242357, + "balance_loss_mlp": 1.0299778, + "epoch": 0.27427403354776647, + "flos": 34166861951040.0, + "grad_norm": 2.1743220538486403, + "language_loss": 0.7182079, + "learning_rate": 3.4062042179441318e-06, + "loss": 0.74720901, + "num_input_tokens_seen": 48937100, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 3.0703125, + "step": 2281, + "time_per_iteration": 3.202266216278076 + }, + { + "auxiliary_loss_clip": 0.0156807, + "auxiliary_loss_mlp": 0.01330092, + "balance_loss_clip": 1.20677221, + "balance_loss_mlp": 1.02510726, + "epoch": 0.2743942764384056, + "flos": 18768785523840.0, + "grad_norm": 2.779474408208594, + "language_loss": 0.80347455, + "learning_rate": 3.4056501932059314e-06, + "loss": 0.83245623, + "num_input_tokens_seen": 48955175, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 3.04492188, + "step": 2282, + "time_per_iteration": 2.954944133758545 + }, + { + "auxiliary_loss_clip": 0.01597887, + "auxiliary_loss_mlp": 0.01306015, + "balance_loss_clip": 1.23200059, + "balance_loss_mlp": 1.05271912, + "epoch": 0.2745145193290447, + "flos": 64909497443040.0, + "grad_norm": 0.7948333902391678, + "language_loss": 0.58135593, + "learning_rate": 3.405095955231715e-06, + "loss": 0.61039495, + "num_input_tokens_seen": 49006830, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 2.53125, + "step": 2283, + "time_per_iteration": 3.460399866104126 + }, + { + "auxiliary_loss_clip": 0.01560917, + "auxiliary_loss_mlp": 0.0133696, + "balance_loss_clip": 1.19955039, + "balance_loss_mlp": 1.03712463, + "epoch": 0.27463476221968375, + "flos": 16138403468640.0, + "grad_norm": 16.406322355713268, + "language_loss": 0.9484179, + "learning_rate": 3.4045415041055585e-06, + "loss": 0.97739673, + "num_input_tokens_seen": 49022470, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 2.99414062, + "step": 2284, + "time_per_iteration": 2.990840196609497 + }, + { + "auxiliary_loss_clip": 0.0156493, + "auxiliary_loss_mlp": 0.01338862, + "balance_loss_clip": 1.2040683, + "balance_loss_mlp": 1.03406715, + "epoch": 0.27475500511032286, + "flos": 10378120390560.0, + "grad_norm": 2.768918014054715, + "language_loss": 0.79212272, + "learning_rate": 3.4039868399115728e-06, + "loss": 0.82116067, + "num_input_tokens_seen": 49037110, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 3.04296875, + "step": 2285, + "time_per_iteration": 2.926567792892456 + }, + { + "auxiliary_loss_clip": 0.01571915, + "auxiliary_loss_mlp": 0.01346737, + "balance_loss_clip": 1.21304703, + "balance_loss_mlp": 1.04232454, + "epoch": 0.27487524800096197, + "flos": 17312974385760.0, + "grad_norm": 2.019375788974501, + "language_loss": 0.8070296, + "learning_rate": 3.4034319627339003e-06, + "loss": 0.83621615, + "num_input_tokens_seen": 49053975, + "router_z_loss_clip": 3.5859375, + "router_z_loss_mlp": 3.0390625, + "step": 2286, + "time_per_iteration": 2.9538729190826416 + }, + { + "auxiliary_loss_clip": 0.01568535, + "auxiliary_loss_mlp": 0.01346878, + "balance_loss_clip": 1.20642328, + "balance_loss_mlp": 1.0438, + "epoch": 0.274995490891601, + "flos": 27122356552320.0, + "grad_norm": 2.997792092469478, + "language_loss": 0.69642758, + "learning_rate": 3.402876872656715e-06, + "loss": 0.72558165, + "num_input_tokens_seen": 49072295, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 3.02539062, + "step": 2287, + "time_per_iteration": 2.934696674346924 + }, + { + "auxiliary_loss_clip": 0.01567977, + "auxiliary_loss_mlp": 0.01340239, + "balance_loss_clip": 1.20721948, + "balance_loss_mlp": 1.03811502, + "epoch": 0.27511573378224013, + "flos": 23438092078080.0, + "grad_norm": 1.840978670777485, + "language_loss": 0.89766836, + "learning_rate": 3.402321569764223e-06, + "loss": 0.92675054, + "num_input_tokens_seen": 49091600, + "router_z_loss_clip": 3.60351562, + "router_z_loss_mlp": 3.015625, + "step": 2288, + "time_per_iteration": 2.970435857772827 + }, + { + "auxiliary_loss_clip": 0.01561236, + "auxiliary_loss_mlp": 0.01340654, + "balance_loss_clip": 1.19997525, + "balance_loss_mlp": 1.03509676, + "epoch": 0.2752359766728792, + "flos": 16723602878400.0, + "grad_norm": 1.9207153671517967, + "language_loss": 0.83705562, + "learning_rate": 3.4017660541406635e-06, + "loss": 0.8660745, + "num_input_tokens_seen": 49107665, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 3.05078125, + "step": 2289, + "time_per_iteration": 2.9450831413269043 + }, + { + "auxiliary_loss_clip": 0.01566704, + "auxiliary_loss_mlp": 0.01341548, + "balance_loss_clip": 1.20557737, + "balance_loss_mlp": 1.03808904, + "epoch": 0.2753562195635183, + "flos": 25299890248320.0, + "grad_norm": 1.800749464284512, + "language_loss": 0.74608094, + "learning_rate": 3.4012103258703092e-06, + "loss": 0.77516353, + "num_input_tokens_seen": 49126420, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 3.02929688, + "step": 2290, + "time_per_iteration": 2.939547538757324 + }, + { + "auxiliary_loss_clip": 0.0156664, + "auxiliary_loss_mlp": 0.01336798, + "balance_loss_clip": 1.207546, + "balance_loss_mlp": 1.03200376, + "epoch": 0.2754764624541574, + "flos": 27341128365120.0, + "grad_norm": 1.9979058800022698, + "language_loss": 0.83572972, + "learning_rate": 3.4006543850374616e-06, + "loss": 0.86476409, + "num_input_tokens_seen": 49141470, + "router_z_loss_clip": 3.58984375, + "router_z_loss_mlp": 3.04296875, + "step": 2291, + "time_per_iteration": 2.9950084686279297 + }, + { + "auxiliary_loss_clip": 0.01561613, + "auxiliary_loss_mlp": 0.01344253, + "balance_loss_clip": 1.20049071, + "balance_loss_mlp": 1.0404129, + "epoch": 0.27559670534479647, + "flos": 17240303302560.0, + "grad_norm": 2.1066900283940733, + "language_loss": 0.75423634, + "learning_rate": 3.400098231726458e-06, + "loss": 0.78329498, + "num_input_tokens_seen": 49158570, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 3.03320312, + "step": 2292, + "time_per_iteration": 2.8971996307373047 + }, + { + "auxiliary_loss_clip": 0.01565853, + "auxiliary_loss_mlp": 0.01335524, + "balance_loss_clip": 1.2064364, + "balance_loss_mlp": 1.0288223, + "epoch": 0.2757169482354356, + "flos": 21940938881280.0, + "grad_norm": 2.272528403662784, + "language_loss": 0.87003815, + "learning_rate": 3.3995418660216657e-06, + "loss": 0.8990519, + "num_input_tokens_seen": 49176025, + "router_z_loss_clip": 3.59570312, + "router_z_loss_mlp": 3.0625, + "step": 2293, + "time_per_iteration": 2.9817798137664795 + }, + { + "auxiliary_loss_clip": 0.01563623, + "auxiliary_loss_mlp": 0.01352014, + "balance_loss_clip": 1.20375943, + "balance_loss_mlp": 1.04454899, + "epoch": 0.2758371911260747, + "flos": 20852806969440.0, + "grad_norm": 2.4026541099374144, + "language_loss": 0.8072542, + "learning_rate": 3.3989852880074848e-06, + "loss": 0.83641052, + "num_input_tokens_seen": 49197455, + "router_z_loss_clip": 3.59570312, + "router_z_loss_mlp": 3.0703125, + "step": 2294, + "time_per_iteration": 3.077122688293457 + }, + { + "auxiliary_loss_clip": 0.01594326, + "auxiliary_loss_mlp": 0.01240387, + "balance_loss_clip": 1.2301147, + "balance_loss_mlp": 1.00311279, + "epoch": 0.27595743401671374, + "flos": 69276058145280.0, + "grad_norm": 0.908308971940424, + "language_loss": 0.60528612, + "learning_rate": 3.398428497768348e-06, + "loss": 0.63363326, + "num_input_tokens_seen": 49262625, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.3671875, + "step": 2295, + "time_per_iteration": 3.5381920337677 + }, + { + "auxiliary_loss_clip": 0.01564004, + "auxiliary_loss_mlp": 0.01330662, + "balance_loss_clip": 1.2052629, + "balance_loss_mlp": 1.02701235, + "epoch": 0.27607767690735285, + "flos": 21217148517600.0, + "grad_norm": 1.9147382175389822, + "language_loss": 0.72076046, + "learning_rate": 3.3978714953887205e-06, + "loss": 0.74970716, + "num_input_tokens_seen": 49282380, + "router_z_loss_clip": 3.58984375, + "router_z_loss_mlp": 3.03125, + "step": 2296, + "time_per_iteration": 3.899867534637451 + }, + { + "auxiliary_loss_clip": 0.01560009, + "auxiliary_loss_mlp": 0.0134251, + "balance_loss_clip": 1.19873953, + "balance_loss_mlp": 1.03638041, + "epoch": 0.27619791979799196, + "flos": 24827566135680.0, + "grad_norm": 1.953220047381283, + "language_loss": 0.86119497, + "learning_rate": 3.397314280953098e-06, + "loss": 0.89022017, + "num_input_tokens_seen": 49303205, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 3.05664062, + "step": 2297, + "time_per_iteration": 3.9615695476531982 + }, + { + "auxiliary_loss_clip": 0.0155921, + "auxiliary_loss_mlp": 0.01338117, + "balance_loss_clip": 1.19854081, + "balance_loss_mlp": 1.02988982, + "epoch": 0.276318162688631, + "flos": 24755843256480.0, + "grad_norm": 2.351149300406926, + "language_loss": 0.80187166, + "learning_rate": 3.3967568545460108e-06, + "loss": 0.83084494, + "num_input_tokens_seen": 49322745, + "router_z_loss_clip": 3.60742188, + "router_z_loss_mlp": 3.078125, + "step": 2298, + "time_per_iteration": 3.0081920623779297 + }, + { + "auxiliary_loss_clip": 0.01562814, + "auxiliary_loss_mlp": 0.01360477, + "balance_loss_clip": 1.20085204, + "balance_loss_mlp": 1.05396581, + "epoch": 0.27643840557927013, + "flos": 18151953664320.0, + "grad_norm": 2.034640452128888, + "language_loss": 0.80570471, + "learning_rate": 3.3961992162520185e-06, + "loss": 0.83493763, + "num_input_tokens_seen": 49341370, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 3.0625, + "step": 2299, + "time_per_iteration": 3.0328521728515625 + }, + { + "auxiliary_loss_clip": 0.01556757, + "auxiliary_loss_mlp": 0.01348687, + "balance_loss_clip": 1.1943109, + "balance_loss_mlp": 1.04141271, + "epoch": 0.27655864846990924, + "flos": 24826086937440.0, + "grad_norm": 2.3080934585007316, + "language_loss": 0.7176863, + "learning_rate": 3.3956413661557156e-06, + "loss": 0.74674076, + "num_input_tokens_seen": 49361545, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 3.06835938, + "step": 2300, + "time_per_iteration": 3.026181697845459 + }, + { + "auxiliary_loss_clip": 0.01559155, + "auxiliary_loss_mlp": 0.01342501, + "balance_loss_clip": 1.19864988, + "balance_loss_mlp": 1.03465533, + "epoch": 0.2766788913605483, + "flos": 20268669548160.0, + "grad_norm": 2.4105755427844064, + "language_loss": 0.65943301, + "learning_rate": 3.3950833043417273e-06, + "loss": 0.68844962, + "num_input_tokens_seen": 49379690, + "router_z_loss_clip": 3.60351562, + "router_z_loss_mlp": 3.07421875, + "step": 2301, + "time_per_iteration": 2.981005907058716 + }, + { + "auxiliary_loss_clip": 0.01567791, + "auxiliary_loss_mlp": 0.01364181, + "balance_loss_clip": 1.20523584, + "balance_loss_mlp": 1.05461812, + "epoch": 0.2767991342511874, + "flos": 21472407584640.0, + "grad_norm": 10.578430437445782, + "language_loss": 0.73506498, + "learning_rate": 3.3945250308947105e-06, + "loss": 0.76438475, + "num_input_tokens_seen": 49395995, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 3.09179688, + "step": 2302, + "time_per_iteration": 3.9492294788360596 + }, + { + "auxiliary_loss_clip": 0.01600609, + "auxiliary_loss_mlp": 0.01304367, + "balance_loss_clip": 1.23313344, + "balance_loss_mlp": 1.05412292, + "epoch": 0.2769193771418265, + "flos": 66008363024160.0, + "grad_norm": 1.2929077348527338, + "language_loss": 0.68320787, + "learning_rate": 3.3939665458993556e-06, + "loss": 0.71225762, + "num_input_tokens_seen": 49450415, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 2.5, + "step": 2303, + "time_per_iteration": 4.287232398986816 + }, + { + "auxiliary_loss_clip": 0.01558984, + "auxiliary_loss_mlp": 0.01350466, + "balance_loss_clip": 1.19564605, + "balance_loss_mlp": 1.04128456, + "epoch": 0.27703962003246557, + "flos": 20706554527200.0, + "grad_norm": 2.2159468453697775, + "language_loss": 0.76992631, + "learning_rate": 3.3934078494403843e-06, + "loss": 0.79902077, + "num_input_tokens_seen": 49469990, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 3.08789062, + "step": 2304, + "time_per_iteration": 3.1157288551330566 + }, + { + "auxiliary_loss_clip": 0.01567814, + "auxiliary_loss_mlp": 0.01349496, + "balance_loss_clip": 1.20635247, + "balance_loss_mlp": 1.04241276, + "epoch": 0.2771598629231047, + "flos": 22932125323200.0, + "grad_norm": 1.693232528941554, + "language_loss": 0.81418478, + "learning_rate": 3.3928489416025495e-06, + "loss": 0.84335786, + "num_input_tokens_seen": 49490835, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 3.06640625, + "step": 2305, + "time_per_iteration": 3.0934176445007324 + }, + { + "auxiliary_loss_clip": 0.01559219, + "auxiliary_loss_mlp": 0.0134898, + "balance_loss_clip": 1.19663417, + "balance_loss_mlp": 1.04189706, + "epoch": 0.27728010581374374, + "flos": 18371446112160.0, + "grad_norm": 2.1189440706473275, + "language_loss": 0.79172134, + "learning_rate": 3.392289822470638e-06, + "loss": 0.82080328, + "num_input_tokens_seen": 49508815, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 3.06640625, + "step": 2306, + "time_per_iteration": 3.080366849899292 + }, + { + "auxiliary_loss_clip": 0.01558597, + "auxiliary_loss_mlp": 0.0132945, + "balance_loss_clip": 1.19613326, + "balance_loss_mlp": 1.02732587, + "epoch": 0.27740034870438285, + "flos": 19429690269600.0, + "grad_norm": 5.065574171650585, + "language_loss": 0.7604776, + "learning_rate": 3.3917304921294674e-06, + "loss": 0.78935802, + "num_input_tokens_seen": 49526980, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 3.015625, + "step": 2307, + "time_per_iteration": 2.995534896850586 + }, + { + "auxiliary_loss_clip": 0.01557264, + "auxiliary_loss_mlp": 0.01340667, + "balance_loss_clip": 1.19467688, + "balance_loss_mlp": 1.03339314, + "epoch": 0.27752059159502196, + "flos": 21616839475200.0, + "grad_norm": 2.2287065203540113, + "language_loss": 0.81236893, + "learning_rate": 3.3911709506638876e-06, + "loss": 0.84134817, + "num_input_tokens_seen": 49546290, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 3.06835938, + "step": 2308, + "time_per_iteration": 2.9736742973327637 + }, + { + "auxiliary_loss_clip": 0.01555188, + "auxiliary_loss_mlp": 0.01338488, + "balance_loss_clip": 1.19192123, + "balance_loss_mlp": 1.03140521, + "epoch": 0.277640834485661, + "flos": 26610055794720.0, + "grad_norm": 2.3678645079878664, + "language_loss": 0.81643057, + "learning_rate": 3.390611198158781e-06, + "loss": 0.84536731, + "num_input_tokens_seen": 49564165, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 3.06640625, + "step": 2309, + "time_per_iteration": 2.9563333988189697 + }, + { + "auxiliary_loss_clip": 0.01569672, + "auxiliary_loss_mlp": 0.01337804, + "balance_loss_clip": 1.20597458, + "balance_loss_mlp": 1.03262782, + "epoch": 0.2777610773763001, + "flos": 19494434367360.0, + "grad_norm": 2.1350032988930527, + "language_loss": 0.90386677, + "learning_rate": 3.3900512346990612e-06, + "loss": 0.93294156, + "num_input_tokens_seen": 49580155, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 3.046875, + "step": 2310, + "time_per_iteration": 2.910506010055542 + }, + { + "auxiliary_loss_clip": 0.01561126, + "auxiliary_loss_mlp": 0.01343458, + "balance_loss_clip": 1.19909549, + "balance_loss_mlp": 1.03542137, + "epoch": 0.27788132026693924, + "flos": 38293752424320.0, + "grad_norm": 2.1447451334263157, + "language_loss": 0.66238952, + "learning_rate": 3.389491060369674e-06, + "loss": 0.69143534, + "num_input_tokens_seen": 49605830, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 3.07617188, + "step": 2311, + "time_per_iteration": 3.0831820964813232 + }, + { + "auxiliary_loss_clip": 0.0157213, + "auxiliary_loss_mlp": 0.01331831, + "balance_loss_clip": 1.20928168, + "balance_loss_mlp": 1.02837229, + "epoch": 0.2780015631575783, + "flos": 22384740653280.0, + "grad_norm": 2.786792737548567, + "language_loss": 0.89519596, + "learning_rate": 3.388930675255598e-06, + "loss": 0.92423558, + "num_input_tokens_seen": 49625680, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 3.02929688, + "step": 2312, + "time_per_iteration": 2.934870958328247 + }, + { + "auxiliary_loss_clip": 0.01570164, + "auxiliary_loss_mlp": 0.01336341, + "balance_loss_clip": 1.20732975, + "balance_loss_mlp": 1.03154635, + "epoch": 0.2781218060482174, + "flos": 12204834648480.0, + "grad_norm": 4.967793555527945, + "language_loss": 0.79679006, + "learning_rate": 3.388370079441843e-06, + "loss": 0.82585514, + "num_input_tokens_seen": 49641195, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 3.04296875, + "step": 2313, + "time_per_iteration": 2.9932620525360107 + }, + { + "auxiliary_loss_clip": 0.01575435, + "auxiliary_loss_mlp": 0.01338497, + "balance_loss_clip": 1.21321225, + "balance_loss_mlp": 1.03522873, + "epoch": 0.2782420489388565, + "flos": 18109246191840.0, + "grad_norm": 2.183000196201973, + "language_loss": 0.93266529, + "learning_rate": 3.3878092730134505e-06, + "loss": 0.96180463, + "num_input_tokens_seen": 49659180, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 3.02929688, + "step": 2314, + "time_per_iteration": 2.973771095275879 + }, + { + "auxiliary_loss_clip": 0.01572013, + "auxiliary_loss_mlp": 0.01351228, + "balance_loss_clip": 1.21081543, + "balance_loss_mlp": 1.04261899, + "epoch": 0.27836229182949557, + "flos": 18516712422240.0, + "grad_norm": 1.7735936899224949, + "language_loss": 0.80416179, + "learning_rate": 3.3872482560554947e-06, + "loss": 0.83339417, + "num_input_tokens_seen": 49677955, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 3.08203125, + "step": 2315, + "time_per_iteration": 3.0399084091186523 + }, + { + "auxiliary_loss_clip": 0.01585448, + "auxiliary_loss_mlp": 0.01233337, + "balance_loss_clip": 1.21707511, + "balance_loss_mlp": 0.9914856, + "epoch": 0.2784825347201347, + "flos": 67086064327680.0, + "grad_norm": 0.8004709298143355, + "language_loss": 0.56894433, + "learning_rate": 3.386687028653082e-06, + "loss": 0.59713221, + "num_input_tokens_seen": 49740800, + "router_z_loss_clip": 3.6875, + "router_z_loss_mlp": 2.4140625, + "step": 2316, + "time_per_iteration": 3.547989845275879 + }, + { + "auxiliary_loss_clip": 0.0156934, + "auxiliary_loss_mlp": 0.01346233, + "balance_loss_clip": 1.20713937, + "balance_loss_mlp": 1.04105687, + "epoch": 0.2786027776107738, + "flos": 22632945082560.0, + "grad_norm": 1.8163386516112603, + "language_loss": 0.84872144, + "learning_rate": 3.386125590891349e-06, + "loss": 0.87787718, + "num_input_tokens_seen": 49757675, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 3.046875, + "step": 2317, + "time_per_iteration": 3.062490463256836 + }, + { + "auxiliary_loss_clip": 0.01570727, + "auxiliary_loss_mlp": 0.01332503, + "balance_loss_clip": 1.20906234, + "balance_loss_mlp": 1.02751851, + "epoch": 0.27872302050141284, + "flos": 15780661420320.0, + "grad_norm": 2.370906699778962, + "language_loss": 0.8305704, + "learning_rate": 3.3855639428554657e-06, + "loss": 0.85960269, + "num_input_tokens_seen": 49775205, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 3.04492188, + "step": 2318, + "time_per_iteration": 3.0607895851135254 + }, + { + "auxiliary_loss_clip": 0.01566159, + "auxiliary_loss_mlp": 0.01352203, + "balance_loss_clip": 1.20374107, + "balance_loss_mlp": 1.04512012, + "epoch": 0.27884326339205195, + "flos": 22129140232800.0, + "grad_norm": 1.9839128313818097, + "language_loss": 0.81006718, + "learning_rate": 3.385002084630635e-06, + "loss": 0.83925074, + "num_input_tokens_seen": 49794175, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 3.06640625, + "step": 2319, + "time_per_iteration": 3.0259006023406982 + }, + { + "auxiliary_loss_clip": 0.01570835, + "auxiliary_loss_mlp": 0.01340505, + "balance_loss_clip": 1.20897889, + "balance_loss_mlp": 1.03399432, + "epoch": 0.278963506282691, + "flos": 20560833079200.0, + "grad_norm": 2.141523169578944, + "language_loss": 0.8498348, + "learning_rate": 3.384440016302088e-06, + "loss": 0.87894821, + "num_input_tokens_seen": 49812850, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 3.0625, + "step": 2320, + "time_per_iteration": 2.922464370727539 + }, + { + "auxiliary_loss_clip": 0.01556566, + "auxiliary_loss_mlp": 0.01328348, + "balance_loss_clip": 1.19338584, + "balance_loss_mlp": 1.02527046, + "epoch": 0.2790837491733301, + "flos": 21944997194400.0, + "grad_norm": 2.1049166476967094, + "language_loss": 0.61977082, + "learning_rate": 3.3838777379550923e-06, + "loss": 0.64861995, + "num_input_tokens_seen": 49832295, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 3.02734375, + "step": 2321, + "time_per_iteration": 2.998215675354004 + }, + { + "auxiliary_loss_clip": 0.01550819, + "auxiliary_loss_mlp": 0.01335181, + "balance_loss_clip": 1.18702888, + "balance_loss_mlp": 1.03286624, + "epoch": 0.27920399206396923, + "flos": 26289218210400.0, + "grad_norm": 1.9306786841830927, + "language_loss": 0.78632218, + "learning_rate": 3.383315249674944e-06, + "loss": 0.81518215, + "num_input_tokens_seen": 49850860, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 3.01757812, + "step": 2322, + "time_per_iteration": 3.0626602172851562 + }, + { + "auxiliary_loss_clip": 0.01558779, + "auxiliary_loss_mlp": 0.01341344, + "balance_loss_clip": 1.19702077, + "balance_loss_mlp": 1.037503, + "epoch": 0.2793242349546083, + "flos": 25402790439360.0, + "grad_norm": 2.879742928925392, + "language_loss": 0.86092818, + "learning_rate": 3.3827525515469715e-06, + "loss": 0.88992941, + "num_input_tokens_seen": 49865765, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 3.03320312, + "step": 2323, + "time_per_iteration": 3.8063580989837646 + }, + { + "auxiliary_loss_clip": 0.01560527, + "auxiliary_loss_mlp": 0.01339489, + "balance_loss_clip": 1.19692791, + "balance_loss_mlp": 1.03393221, + "epoch": 0.2794444778452474, + "flos": 20852465616000.0, + "grad_norm": 2.0789668647908335, + "language_loss": 0.71216398, + "learning_rate": 3.3821896436565367e-06, + "loss": 0.74116421, + "num_input_tokens_seen": 49885425, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 3.05078125, + "step": 2324, + "time_per_iteration": 3.9138481616973877 + }, + { + "auxiliary_loss_clip": 0.01556646, + "auxiliary_loss_mlp": 0.01349087, + "balance_loss_clip": 1.19372213, + "balance_loss_mlp": 1.0406692, + "epoch": 0.2795647207358865, + "flos": 21578190315840.0, + "grad_norm": 2.0144832502711036, + "language_loss": 0.70438206, + "learning_rate": 3.381626526089032e-06, + "loss": 0.73343939, + "num_input_tokens_seen": 49904990, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 3.08007812, + "step": 2325, + "time_per_iteration": 2.9834353923797607 + }, + { + "auxiliary_loss_clip": 0.01551365, + "auxiliary_loss_mlp": 0.01327612, + "balance_loss_clip": 1.18812799, + "balance_loss_mlp": 1.02338982, + "epoch": 0.27968496362652556, + "flos": 21473962639200.0, + "grad_norm": 2.326181568495115, + "language_loss": 0.79249424, + "learning_rate": 3.3810631989298815e-06, + "loss": 0.821284, + "num_input_tokens_seen": 49924600, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 3.03710938, + "step": 2326, + "time_per_iteration": 2.9229581356048584 + }, + { + "auxiliary_loss_clip": 0.01553573, + "auxiliary_loss_mlp": 0.01352269, + "balance_loss_clip": 1.19215369, + "balance_loss_mlp": 1.04423261, + "epoch": 0.2798052065171647, + "flos": 23260965749280.0, + "grad_norm": 2.0683943509189695, + "language_loss": 0.84458709, + "learning_rate": 3.3804996622645423e-06, + "loss": 0.87364542, + "num_input_tokens_seen": 49942600, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 3.07617188, + "step": 2327, + "time_per_iteration": 2.9865753650665283 + }, + { + "auxiliary_loss_clip": 0.01550178, + "auxiliary_loss_mlp": 0.01339693, + "balance_loss_clip": 1.18792152, + "balance_loss_mlp": 1.03394544, + "epoch": 0.2799254494078038, + "flos": 21541247923680.0, + "grad_norm": 2.7422420711760123, + "language_loss": 0.89340925, + "learning_rate": 3.3799359161785015e-06, + "loss": 0.92230797, + "num_input_tokens_seen": 49962250, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 3.05273438, + "step": 2328, + "time_per_iteration": 2.9861934185028076 + }, + { + "auxiliary_loss_clip": 0.01546151, + "auxiliary_loss_mlp": 0.01332672, + "balance_loss_clip": 1.18471062, + "balance_loss_mlp": 1.02921259, + "epoch": 0.28004569229844284, + "flos": 26396214642720.0, + "grad_norm": 1.9225534640139144, + "language_loss": 0.85870194, + "learning_rate": 3.3793719607572798e-06, + "loss": 0.88749015, + "num_input_tokens_seen": 49983215, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 3.02929688, + "step": 2329, + "time_per_iteration": 4.592103481292725 + }, + { + "auxiliary_loss_clip": 0.01552534, + "auxiliary_loss_mlp": 0.01348426, + "balance_loss_clip": 1.1903528, + "balance_loss_mlp": 1.04344141, + "epoch": 0.28016593518908195, + "flos": 33550523157600.0, + "grad_norm": 2.9197772576597303, + "language_loss": 0.77063787, + "learning_rate": 3.378807796086428e-06, + "loss": 0.79964745, + "num_input_tokens_seen": 50006075, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 3.04492188, + "step": 2330, + "time_per_iteration": 3.1016976833343506 + }, + { + "auxiliary_loss_clip": 0.01554479, + "auxiliary_loss_mlp": 0.01333061, + "balance_loss_clip": 1.19074082, + "balance_loss_mlp": 1.02883875, + "epoch": 0.28028617807972106, + "flos": 15342624728640.0, + "grad_norm": 2.186045305050802, + "language_loss": 0.76880139, + "learning_rate": 3.37824342225153e-06, + "loss": 0.7976768, + "num_input_tokens_seen": 50022495, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 3.03710938, + "step": 2331, + "time_per_iteration": 3.001406192779541 + }, + { + "auxiliary_loss_clip": 0.01555816, + "auxiliary_loss_mlp": 0.01334317, + "balance_loss_clip": 1.19521916, + "balance_loss_mlp": 1.03009534, + "epoch": 0.2804064209703601, + "flos": 25522644517920.0, + "grad_norm": 2.0125668244844572, + "language_loss": 0.77869022, + "learning_rate": 3.3776788393382006e-06, + "loss": 0.8075915, + "num_input_tokens_seen": 50041975, + "router_z_loss_clip": 3.60351562, + "router_z_loss_mlp": 3.0390625, + "step": 2332, + "time_per_iteration": 3.0233352184295654 + }, + { + "auxiliary_loss_clip": 0.01548249, + "auxiliary_loss_mlp": 0.01344442, + "balance_loss_clip": 1.18525457, + "balance_loss_mlp": 1.03602409, + "epoch": 0.2805266638609992, + "flos": 29354375135520.0, + "grad_norm": 3.345089742815554, + "language_loss": 0.77269208, + "learning_rate": 3.3771140474320872e-06, + "loss": 0.80161905, + "num_input_tokens_seen": 50061925, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 3.08007812, + "step": 2333, + "time_per_iteration": 3.0929949283599854 + }, + { + "auxiliary_loss_clip": 0.01554259, + "auxiliary_loss_mlp": 0.01355718, + "balance_loss_clip": 1.19101357, + "balance_loss_mlp": 1.04882598, + "epoch": 0.28064690675163834, + "flos": 21465504659520.0, + "grad_norm": 2.772283242395803, + "language_loss": 0.79635608, + "learning_rate": 3.3765490466188664e-06, + "loss": 0.8254559, + "num_input_tokens_seen": 50079325, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 3.06445312, + "step": 2334, + "time_per_iteration": 2.9755520820617676 + }, + { + "auxiliary_loss_clip": 0.01551186, + "auxiliary_loss_mlp": 0.01335859, + "balance_loss_clip": 1.18889451, + "balance_loss_mlp": 1.03297234, + "epoch": 0.2807671496422774, + "flos": 20997731926080.0, + "grad_norm": 2.9169804493948766, + "language_loss": 0.7379294, + "learning_rate": 3.3759838369842508e-06, + "loss": 0.76679987, + "num_input_tokens_seen": 50097400, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 3.0234375, + "step": 2335, + "time_per_iteration": 3.028252601623535 + }, + { + "auxiliary_loss_clip": 0.01549329, + "auxiliary_loss_mlp": 0.01332969, + "balance_loss_clip": 1.18804669, + "balance_loss_mlp": 1.0279839, + "epoch": 0.2808873925329165, + "flos": 21508705198080.0, + "grad_norm": 2.0434523031592216, + "language_loss": 0.73286611, + "learning_rate": 3.375418418613981e-06, + "loss": 0.76168907, + "num_input_tokens_seen": 50116425, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 3.04492188, + "step": 2336, + "time_per_iteration": 3.0402748584747314 + }, + { + "auxiliary_loss_clip": 0.01544261, + "auxiliary_loss_mlp": 0.01333445, + "balance_loss_clip": 1.18095541, + "balance_loss_mlp": 1.02807808, + "epoch": 0.28100763542355556, + "flos": 16072673238720.0, + "grad_norm": 2.303858235641995, + "language_loss": 0.8377645, + "learning_rate": 3.374852791593831e-06, + "loss": 0.86654162, + "num_input_tokens_seen": 50132625, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 3.04882812, + "step": 2337, + "time_per_iteration": 2.9457285404205322 + }, + { + "auxiliary_loss_clip": 0.01548652, + "auxiliary_loss_mlp": 0.01340873, + "balance_loss_clip": 1.18687701, + "balance_loss_mlp": 1.03340793, + "epoch": 0.28112787831419467, + "flos": 19064628086400.0, + "grad_norm": 6.764868462854059, + "language_loss": 0.54558122, + "learning_rate": 3.374286956009605e-06, + "loss": 0.57447648, + "num_input_tokens_seen": 50151190, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 3.0703125, + "step": 2338, + "time_per_iteration": 3.030339241027832 + }, + { + "auxiliary_loss_clip": 0.01549878, + "auxiliary_loss_mlp": 0.01337053, + "balance_loss_clip": 1.1869216, + "balance_loss_mlp": 1.03187752, + "epoch": 0.2812481212048338, + "flos": 12825307611360.0, + "grad_norm": 2.59230574167307, + "language_loss": 0.75526035, + "learning_rate": 3.3737209119471405e-06, + "loss": 0.78412974, + "num_input_tokens_seen": 50167700, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 3.046875, + "step": 2339, + "time_per_iteration": 2.96600341796875 + }, + { + "auxiliary_loss_clip": 0.01550982, + "auxiliary_loss_mlp": 0.0134226, + "balance_loss_clip": 1.18956625, + "balance_loss_mlp": 1.03441429, + "epoch": 0.28136836409547283, + "flos": 15634902044160.0, + "grad_norm": 3.515449976351007, + "language_loss": 0.63627976, + "learning_rate": 3.373154659492306e-06, + "loss": 0.66521215, + "num_input_tokens_seen": 50185840, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 3.07421875, + "step": 2340, + "time_per_iteration": 3.0114619731903076 + }, + { + "auxiliary_loss_clip": 0.0154487, + "auxiliary_loss_mlp": 0.01332291, + "balance_loss_clip": 1.1820941, + "balance_loss_mlp": 1.02959526, + "epoch": 0.28148860698611194, + "flos": 19935543240000.0, + "grad_norm": 9.252231540524427, + "language_loss": 0.85271227, + "learning_rate": 3.3725881987310016e-06, + "loss": 0.88148385, + "num_input_tokens_seen": 50203375, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 3.02148438, + "step": 2341, + "time_per_iteration": 3.0658910274505615 + }, + { + "auxiliary_loss_clip": 0.01536632, + "auxiliary_loss_mlp": 0.013237, + "balance_loss_clip": 1.1736691, + "balance_loss_mlp": 1.02005005, + "epoch": 0.28160884987675106, + "flos": 17459150971680.0, + "grad_norm": 2.5270373670036728, + "language_loss": 0.87821257, + "learning_rate": 3.372021529749159e-06, + "loss": 0.90681589, + "num_input_tokens_seen": 50222435, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 3.03125, + "step": 2342, + "time_per_iteration": 3.1057353019714355 + }, + { + "auxiliary_loss_clip": 0.01546392, + "auxiliary_loss_mlp": 0.0132938, + "balance_loss_clip": 1.18273807, + "balance_loss_mlp": 1.02611125, + "epoch": 0.2817290927673901, + "flos": 16836516103680.0, + "grad_norm": 1.9633165286440322, + "language_loss": 0.92530638, + "learning_rate": 3.3714546526327405e-06, + "loss": 0.95406407, + "num_input_tokens_seen": 50240435, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 3.02734375, + "step": 2343, + "time_per_iteration": 3.10500168800354 + }, + { + "auxiliary_loss_clip": 0.01543102, + "auxiliary_loss_mlp": 0.01326974, + "balance_loss_clip": 1.17844844, + "balance_loss_mlp": 1.02485049, + "epoch": 0.2818493356580292, + "flos": 15415902662400.0, + "grad_norm": 2.285380161358849, + "language_loss": 0.88235211, + "learning_rate": 3.3708875674677423e-06, + "loss": 0.91105288, + "num_input_tokens_seen": 50258410, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 3.01757812, + "step": 2344, + "time_per_iteration": 3.009982109069824 + }, + { + "auxiliary_loss_clip": 0.01549753, + "auxiliary_loss_mlp": 0.01344036, + "balance_loss_clip": 1.18633282, + "balance_loss_mlp": 1.04057658, + "epoch": 0.28196957854866833, + "flos": 20414542708800.0, + "grad_norm": 1.8797166145244117, + "language_loss": 0.83838284, + "learning_rate": 3.37032027434019e-06, + "loss": 0.86732078, + "num_input_tokens_seen": 50277930, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 3.02929688, + "step": 2345, + "time_per_iteration": 3.141092300415039 + }, + { + "auxiliary_loss_clip": 0.01545378, + "auxiliary_loss_mlp": 0.01339012, + "balance_loss_clip": 1.18207228, + "balance_loss_mlp": 1.03440905, + "epoch": 0.2820898214393074, + "flos": 19975102675200.0, + "grad_norm": 2.5217218771852563, + "language_loss": 0.83005571, + "learning_rate": 3.369752773336141e-06, + "loss": 0.85889959, + "num_input_tokens_seen": 50297410, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 3.04101562, + "step": 2346, + "time_per_iteration": 3.016315460205078 + }, + { + "auxiliary_loss_clip": 0.01548231, + "auxiliary_loss_mlp": 0.01343893, + "balance_loss_clip": 1.18481219, + "balance_loss_mlp": 1.04119682, + "epoch": 0.2822100643299465, + "flos": 22530500029440.0, + "grad_norm": 2.1953184071070515, + "language_loss": 0.78196549, + "learning_rate": 3.3691850645416864e-06, + "loss": 0.81088674, + "num_input_tokens_seen": 50317120, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 3.0234375, + "step": 2347, + "time_per_iteration": 2.937962293624878 + }, + { + "auxiliary_loss_clip": 0.01542369, + "auxiliary_loss_mlp": 0.013377, + "balance_loss_clip": 1.18049169, + "balance_loss_mlp": 1.03538513, + "epoch": 0.2823303072205856, + "flos": 11548291641120.0, + "grad_norm": 2.3478965192994488, + "language_loss": 0.83407915, + "learning_rate": 3.368617148042945e-06, + "loss": 0.86287987, + "num_input_tokens_seen": 50334790, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 3.01953125, + "step": 2348, + "time_per_iteration": 2.9840431213378906 + }, + { + "auxiliary_loss_clip": 0.01545999, + "auxiliary_loss_mlp": 0.0133529, + "balance_loss_clip": 1.18269336, + "balance_loss_mlp": 1.03621745, + "epoch": 0.28245055011122466, + "flos": 18261870564960.0, + "grad_norm": 2.3900269225227033, + "language_loss": 0.84503353, + "learning_rate": 3.368049023926071e-06, + "loss": 0.87384641, + "num_input_tokens_seen": 50353785, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 2.98632812, + "step": 2349, + "time_per_iteration": 2.9419045448303223 + }, + { + "auxiliary_loss_clip": 0.01552396, + "auxiliary_loss_mlp": 0.01324566, + "balance_loss_clip": 1.18859577, + "balance_loss_mlp": 1.02511263, + "epoch": 0.2825707930018638, + "flos": 24610425233760.0, + "grad_norm": 1.870202096043902, + "language_loss": 0.83760238, + "learning_rate": 3.3674806922772476e-06, + "loss": 0.86637199, + "num_input_tokens_seen": 50374670, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.98828125, + "step": 2350, + "time_per_iteration": 3.955232620239258 + }, + { + "auxiliary_loss_clip": 0.01548572, + "auxiliary_loss_mlp": 0.01330744, + "balance_loss_clip": 1.18538606, + "balance_loss_mlp": 1.02804756, + "epoch": 0.28269103589250283, + "flos": 25229267285760.0, + "grad_norm": 1.7484284619482149, + "language_loss": 0.75267684, + "learning_rate": 3.3669121531826904e-06, + "loss": 0.78147, + "num_input_tokens_seen": 50395650, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 3.02148438, + "step": 2351, + "time_per_iteration": 3.024799346923828 + }, + { + "auxiliary_loss_clip": 0.0155127, + "auxiliary_loss_mlp": 0.01338194, + "balance_loss_clip": 1.18675303, + "balance_loss_mlp": 1.03358996, + "epoch": 0.28281127878314194, + "flos": 19283589540000.0, + "grad_norm": 4.541478230822022, + "language_loss": 0.83497286, + "learning_rate": 3.366343406728647e-06, + "loss": 0.86386752, + "num_input_tokens_seen": 50415100, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 3.04101562, + "step": 2352, + "time_per_iteration": 3.9177820682525635 + }, + { + "auxiliary_loss_clip": 0.01540821, + "auxiliary_loss_mlp": 0.01336187, + "balance_loss_clip": 1.17657256, + "balance_loss_mlp": 1.03635192, + "epoch": 0.28293152167378105, + "flos": 23880680148960.0, + "grad_norm": 1.7411037772418532, + "language_loss": 0.6856184, + "learning_rate": 3.3657744530013946e-06, + "loss": 0.71438849, + "num_input_tokens_seen": 50434335, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 2.9921875, + "step": 2353, + "time_per_iteration": 3.003201961517334 + }, + { + "auxiliary_loss_clip": 0.01550223, + "auxiliary_loss_mlp": 0.01345407, + "balance_loss_clip": 1.18776178, + "balance_loss_mlp": 1.04213834, + "epoch": 0.2830517645644201, + "flos": 43870044176640.0, + "grad_norm": 3.084731829173679, + "language_loss": 0.7174601, + "learning_rate": 3.3652052920872437e-06, + "loss": 0.74641639, + "num_input_tokens_seen": 50457200, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 3.02734375, + "step": 2354, + "time_per_iteration": 3.1465952396392822 + }, + { + "auxiliary_loss_clip": 0.01548787, + "auxiliary_loss_mlp": 0.01333923, + "balance_loss_clip": 1.18462467, + "balance_loss_mlp": 1.02931976, + "epoch": 0.2831720074550592, + "flos": 26654204537280.0, + "grad_norm": 1.9955903218074857, + "language_loss": 0.85593522, + "learning_rate": 3.3646359240725355e-06, + "loss": 0.88476235, + "num_input_tokens_seen": 50476390, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 3.04101562, + "step": 2355, + "time_per_iteration": 3.036168098449707 + }, + { + "auxiliary_loss_clip": 0.01555348, + "auxiliary_loss_mlp": 0.01355331, + "balance_loss_clip": 1.19015455, + "balance_loss_mlp": 1.05358839, + "epoch": 0.2832922503456983, + "flos": 31032675046080.0, + "grad_norm": 2.427690359951235, + "language_loss": 0.67573667, + "learning_rate": 3.364066349043643e-06, + "loss": 0.7048434, + "num_input_tokens_seen": 50497595, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 3.01367188, + "step": 2356, + "time_per_iteration": 4.500617742538452 + }, + { + "auxiliary_loss_clip": 0.01543159, + "auxiliary_loss_mlp": 0.01325687, + "balance_loss_clip": 1.17768776, + "balance_loss_mlp": 1.02680576, + "epoch": 0.2834124932363374, + "flos": 20407412214720.0, + "grad_norm": 1.8695713165403607, + "language_loss": 0.82361472, + "learning_rate": 3.363496567086969e-06, + "loss": 0.85230321, + "num_input_tokens_seen": 50514690, + "router_z_loss_clip": 3.65039062, + "router_z_loss_mlp": 2.984375, + "step": 2357, + "time_per_iteration": 3.034802198410034 + }, + { + "auxiliary_loss_clip": 0.01546435, + "auxiliary_loss_mlp": 0.01346132, + "balance_loss_clip": 1.1816808, + "balance_loss_mlp": 1.04286373, + "epoch": 0.2835327361269765, + "flos": 39387990769920.0, + "grad_norm": 1.9561869754715067, + "language_loss": 0.75619048, + "learning_rate": 3.3629265782889506e-06, + "loss": 0.7851162, + "num_input_tokens_seen": 50536515, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 3.02929688, + "step": 2358, + "time_per_iteration": 3.1085164546966553 + }, + { + "auxiliary_loss_clip": 0.01543742, + "auxiliary_loss_mlp": 0.01346061, + "balance_loss_clip": 1.18093288, + "balance_loss_mlp": 1.0397408, + "epoch": 0.2836529790176156, + "flos": 30264129089280.0, + "grad_norm": 1.9591219927810788, + "language_loss": 0.71808225, + "learning_rate": 3.362356382736054e-06, + "loss": 0.74698031, + "num_input_tokens_seen": 50557120, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 3.05859375, + "step": 2359, + "time_per_iteration": 3.0312139987945557 + }, + { + "auxiliary_loss_clip": 0.01540643, + "auxiliary_loss_mlp": 0.01331368, + "balance_loss_clip": 1.17519712, + "balance_loss_mlp": 1.02905357, + "epoch": 0.28377322190825466, + "flos": 12679472378880.0, + "grad_norm": 2.287674265146889, + "language_loss": 0.91015005, + "learning_rate": 3.361785980514777e-06, + "loss": 0.93887013, + "num_input_tokens_seen": 50573320, + "router_z_loss_clip": 3.65429688, + "router_z_loss_mlp": 3.01953125, + "step": 2360, + "time_per_iteration": 3.0474462509155273 + }, + { + "auxiliary_loss_clip": 0.01541267, + "auxiliary_loss_mlp": 0.01347948, + "balance_loss_clip": 1.17626095, + "balance_loss_mlp": 1.04296303, + "epoch": 0.28389346479889377, + "flos": 18298661244480.0, + "grad_norm": 2.6620505112734842, + "language_loss": 0.76883507, + "learning_rate": 3.361215371711649e-06, + "loss": 0.79772723, + "num_input_tokens_seen": 50592415, + "router_z_loss_clip": 3.65039062, + "router_z_loss_mlp": 3.04492188, + "step": 2361, + "time_per_iteration": 3.0498688220977783 + }, + { + "auxiliary_loss_clip": 0.01542515, + "auxiliary_loss_mlp": 0.01338734, + "balance_loss_clip": 1.1773212, + "balance_loss_mlp": 1.03661025, + "epoch": 0.2840137076895329, + "flos": 20408777628480.0, + "grad_norm": 2.8422160972898003, + "language_loss": 0.83500493, + "learning_rate": 3.3606445564132326e-06, + "loss": 0.86381739, + "num_input_tokens_seen": 50609710, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 3.015625, + "step": 2362, + "time_per_iteration": 3.007692575454712 + }, + { + "auxiliary_loss_clip": 0.01542659, + "auxiliary_loss_mlp": 0.01347563, + "balance_loss_clip": 1.177912, + "balance_loss_mlp": 1.04276872, + "epoch": 0.28413395058017193, + "flos": 20050125304320.0, + "grad_norm": 2.721219535579286, + "language_loss": 0.82796532, + "learning_rate": 3.360073534706118e-06, + "loss": 0.85686749, + "num_input_tokens_seen": 50626865, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 3.04492188, + "step": 2363, + "time_per_iteration": 3.077455520629883 + }, + { + "auxiliary_loss_clip": 0.01542178, + "auxiliary_loss_mlp": 0.01331627, + "balance_loss_clip": 1.17698634, + "balance_loss_mlp": 1.02778673, + "epoch": 0.28425419347081105, + "flos": 37666300680000.0, + "grad_norm": 2.0511610605561708, + "language_loss": 0.75988829, + "learning_rate": 3.35950230667693e-06, + "loss": 0.78862631, + "num_input_tokens_seen": 50648560, + "router_z_loss_clip": 3.65039062, + "router_z_loss_mlp": 3.03320312, + "step": 2364, + "time_per_iteration": 3.1089894771575928 + }, + { + "auxiliary_loss_clip": 0.01540205, + "auxiliary_loss_mlp": 0.01334867, + "balance_loss_clip": 1.17563605, + "balance_loss_mlp": 1.03522265, + "epoch": 0.28437443636145016, + "flos": 13846799017440.0, + "grad_norm": 3.141383869845208, + "language_loss": 0.85894406, + "learning_rate": 3.358930872412323e-06, + "loss": 0.88769472, + "num_input_tokens_seen": 50665725, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 2.99023438, + "step": 2365, + "time_per_iteration": 2.924778699874878 + }, + { + "auxiliary_loss_clip": 0.0153925, + "auxiliary_loss_mlp": 0.01325114, + "balance_loss_clip": 1.17526269, + "balance_loss_mlp": 1.02184606, + "epoch": 0.2844946792520892, + "flos": 22750030405440.0, + "grad_norm": 1.720190107006035, + "language_loss": 0.80766863, + "learning_rate": 3.3583592319989825e-06, + "loss": 0.83631229, + "num_input_tokens_seen": 50685095, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 3.02734375, + "step": 2366, + "time_per_iteration": 2.966684103012085 + }, + { + "auxiliary_loss_clip": 0.01548477, + "auxiliary_loss_mlp": 0.01339814, + "balance_loss_clip": 1.18289948, + "balance_loss_mlp": 1.03521049, + "epoch": 0.2846149221427283, + "flos": 32418735569280.0, + "grad_norm": 1.9675379875969066, + "language_loss": 0.68640602, + "learning_rate": 3.357787385523627e-06, + "loss": 0.71528888, + "num_input_tokens_seen": 50706500, + "router_z_loss_clip": 3.65429688, + "router_z_loss_mlp": 3.04101562, + "step": 2367, + "time_per_iteration": 3.0135812759399414 + }, + { + "auxiliary_loss_clip": 0.01539809, + "auxiliary_loss_mlp": 0.01339168, + "balance_loss_clip": 1.17457771, + "balance_loss_mlp": 1.03666246, + "epoch": 0.2847351650333674, + "flos": 28478529321120.0, + "grad_norm": 2.0501326438578147, + "language_loss": 0.82880616, + "learning_rate": 3.3572153330730048e-06, + "loss": 0.85759592, + "num_input_tokens_seen": 50727595, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 3.01953125, + "step": 2368, + "time_per_iteration": 3.020465612411499 + }, + { + "auxiliary_loss_clip": 0.0162148, + "auxiliary_loss_mlp": 0.01244766, + "balance_loss_clip": 1.24882674, + "balance_loss_mlp": 1.00444031, + "epoch": 0.2848554079240065, + "flos": 55758744332640.0, + "grad_norm": 0.83050533545342, + "language_loss": 0.64606106, + "learning_rate": 3.3566430747338956e-06, + "loss": 0.67472351, + "num_input_tokens_seen": 50782800, + "router_z_loss_clip": 3.7265625, + "router_z_loss_mlp": 2.3984375, + "step": 2369, + "time_per_iteration": 3.2685694694519043 + }, + { + "auxiliary_loss_clip": 0.01535436, + "auxiliary_loss_mlp": 0.01324712, + "balance_loss_clip": 1.16940129, + "balance_loss_mlp": 1.02525866, + "epoch": 0.2849756508146456, + "flos": 11838407051520.0, + "grad_norm": 21.44220932815887, + "language_loss": 0.86449182, + "learning_rate": 3.35607061059311e-06, + "loss": 0.89309329, + "num_input_tokens_seen": 50797730, + "router_z_loss_clip": 3.66015625, + "router_z_loss_mlp": 2.99023438, + "step": 2370, + "time_per_iteration": 2.9661529064178467 + }, + { + "auxiliary_loss_clip": 0.01542334, + "auxiliary_loss_mlp": 0.01337996, + "balance_loss_clip": 1.17485511, + "balance_loss_mlp": 1.03530025, + "epoch": 0.28509589370528465, + "flos": 25157544406560.0, + "grad_norm": 2.9573272342279977, + "language_loss": 0.75031126, + "learning_rate": 3.3554979407374917e-06, + "loss": 0.77911454, + "num_input_tokens_seen": 50819840, + "router_z_loss_clip": 3.67382812, + "router_z_loss_mlp": 3.02148438, + "step": 2371, + "time_per_iteration": 3.0345771312713623 + }, + { + "auxiliary_loss_clip": 0.01530554, + "auxiliary_loss_mlp": 0.01330368, + "balance_loss_clip": 1.16455126, + "balance_loss_mlp": 1.03053248, + "epoch": 0.28521613659592376, + "flos": 19976885298720.0, + "grad_norm": 1.648641305395657, + "language_loss": 0.73653895, + "learning_rate": 3.3549250652539134e-06, + "loss": 0.76514816, + "num_input_tokens_seen": 50838935, + "router_z_loss_clip": 3.66210938, + "router_z_loss_mlp": 2.99414062, + "step": 2372, + "time_per_iteration": 2.9305779933929443 + }, + { + "auxiliary_loss_clip": 0.01530929, + "auxiliary_loss_mlp": 0.01327843, + "balance_loss_clip": 1.16496384, + "balance_loss_mlp": 1.02495646, + "epoch": 0.2853363794865629, + "flos": 23370237871200.0, + "grad_norm": 2.789172458669849, + "language_loss": 0.81650048, + "learning_rate": 3.3543519842292794e-06, + "loss": 0.84508824, + "num_input_tokens_seen": 50858590, + "router_z_loss_clip": 3.66210938, + "router_z_loss_mlp": 3.0234375, + "step": 2373, + "time_per_iteration": 3.035419225692749 + }, + { + "auxiliary_loss_clip": 0.0153305, + "auxiliary_loss_mlp": 0.01330008, + "balance_loss_clip": 1.16602182, + "balance_loss_mlp": 1.02826524, + "epoch": 0.28545662237720193, + "flos": 19863820360800.0, + "grad_norm": 1.9624686839481351, + "language_loss": 0.83556616, + "learning_rate": 3.353778697750527e-06, + "loss": 0.86419672, + "num_input_tokens_seen": 50876995, + "router_z_loss_clip": 3.66796875, + "router_z_loss_mlp": 3.01367188, + "step": 2374, + "time_per_iteration": 2.8886873722076416 + }, + { + "auxiliary_loss_clip": 0.01532669, + "auxiliary_loss_mlp": 0.01330125, + "balance_loss_clip": 1.16523576, + "balance_loss_mlp": 1.02800107, + "epoch": 0.28557686526784104, + "flos": 23881324927680.0, + "grad_norm": 1.7981056810011737, + "language_loss": 0.89465797, + "learning_rate": 3.353205205904622e-06, + "loss": 0.9232859, + "num_input_tokens_seen": 50896105, + "router_z_loss_clip": 3.68164062, + "router_z_loss_mlp": 3.01757812, + "step": 2375, + "time_per_iteration": 3.042318344116211 + }, + { + "auxiliary_loss_clip": 0.01529653, + "auxiliary_loss_mlp": 0.01332881, + "balance_loss_clip": 1.16375804, + "balance_loss_mlp": 1.03304601, + "epoch": 0.28569710815848015, + "flos": 44893621631520.0, + "grad_norm": 3.8299763553676027, + "language_loss": 0.7169385, + "learning_rate": 3.3526315087785637e-06, + "loss": 0.74556381, + "num_input_tokens_seen": 50917220, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 2.9921875, + "step": 2376, + "time_per_iteration": 3.1605496406555176 + }, + { + "auxiliary_loss_clip": 0.01531378, + "auxiliary_loss_mlp": 0.01331032, + "balance_loss_clip": 1.16602898, + "balance_loss_mlp": 1.02757311, + "epoch": 0.2858173510491192, + "flos": 26831975644800.0, + "grad_norm": 1.7876657151429178, + "language_loss": 0.81057572, + "learning_rate": 3.3520576064593805e-06, + "loss": 0.83919978, + "num_input_tokens_seen": 50937175, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 3.02929688, + "step": 2377, + "time_per_iteration": 3.742558479309082 + }, + { + "auxiliary_loss_clip": 0.01533768, + "auxiliary_loss_mlp": 0.0133685, + "balance_loss_clip": 1.16717029, + "balance_loss_mlp": 1.03510749, + "epoch": 0.2859375939397583, + "flos": 23151162633120.0, + "grad_norm": 1.5960501058975167, + "language_loss": 0.82128841, + "learning_rate": 3.3514834990341337e-06, + "loss": 0.8499946, + "num_input_tokens_seen": 50957500, + "router_z_loss_clip": 3.66796875, + "router_z_loss_mlp": 3.01367188, + "step": 2378, + "time_per_iteration": 3.8362793922424316 + }, + { + "auxiliary_loss_clip": 0.0152744, + "auxiliary_loss_mlp": 0.0133587, + "balance_loss_clip": 1.16224456, + "balance_loss_mlp": 1.03832364, + "epoch": 0.2860578368303974, + "flos": 12131405002080.0, + "grad_norm": 10.166596541922425, + "language_loss": 0.9337393, + "learning_rate": 3.3509091865899144e-06, + "loss": 0.96237236, + "num_input_tokens_seen": 50972690, + "router_z_loss_clip": 3.65429688, + "router_z_loss_mlp": 2.97265625, + "step": 2379, + "time_per_iteration": 2.952556848526001 + }, + { + "auxiliary_loss_clip": 0.01533708, + "auxiliary_loss_mlp": 0.01331916, + "balance_loss_clip": 1.16732419, + "balance_loss_mlp": 1.03131795, + "epoch": 0.2861780797210365, + "flos": 19940094619200.0, + "grad_norm": 3.433857677647786, + "language_loss": 0.70472056, + "learning_rate": 3.350334669213846e-06, + "loss": 0.7333768, + "num_input_tokens_seen": 50990095, + "router_z_loss_clip": 3.6640625, + "router_z_loss_mlp": 3.00390625, + "step": 2380, + "time_per_iteration": 2.915388822555542 + }, + { + "auxiliary_loss_clip": 0.01534228, + "auxiliary_loss_mlp": 0.01330197, + "balance_loss_clip": 1.16730571, + "balance_loss_mlp": 1.02807295, + "epoch": 0.2862983226116756, + "flos": 27565589401920.0, + "grad_norm": 2.8939787895171647, + "language_loss": 0.75593847, + "learning_rate": 3.3497599469930816e-06, + "loss": 0.78458273, + "num_input_tokens_seen": 51008305, + "router_z_loss_clip": 3.66796875, + "router_z_loss_mlp": 3.015625, + "step": 2381, + "time_per_iteration": 3.0616908073425293 + }, + { + "auxiliary_loss_clip": 0.01526902, + "auxiliary_loss_mlp": 0.01335118, + "balance_loss_clip": 1.16109896, + "balance_loss_mlp": 1.03223109, + "epoch": 0.28641856550231465, + "flos": 22056127796160.0, + "grad_norm": 2.226938210005908, + "language_loss": 0.83186376, + "learning_rate": 3.349185020014807e-06, + "loss": 0.86048394, + "num_input_tokens_seen": 51025570, + "router_z_loss_clip": 3.66015625, + "router_z_loss_mlp": 3.02734375, + "step": 2382, + "time_per_iteration": 2.914696216583252 + }, + { + "auxiliary_loss_clip": 0.01531099, + "auxiliary_loss_mlp": 0.01327374, + "balance_loss_clip": 1.16389537, + "balance_loss_mlp": 1.0260129, + "epoch": 0.28653880839295376, + "flos": 22380758196480.0, + "grad_norm": 2.0797150848810744, + "language_loss": 0.74471956, + "learning_rate": 3.348609888366237e-06, + "loss": 0.77330434, + "num_input_tokens_seen": 51044585, + "router_z_loss_clip": 3.67382812, + "router_z_loss_mlp": 3.00976562, + "step": 2383, + "time_per_iteration": 2.9865193367004395 + }, + { + "auxiliary_loss_clip": 0.01528883, + "auxiliary_loss_mlp": 0.01336765, + "balance_loss_clip": 1.16387641, + "balance_loss_mlp": 1.03502202, + "epoch": 0.28665905128359287, + "flos": 23370237871200.0, + "grad_norm": 2.310617426421161, + "language_loss": 0.63104475, + "learning_rate": 3.348034552134619e-06, + "loss": 0.65970123, + "num_input_tokens_seen": 51063990, + "router_z_loss_clip": 3.65429688, + "router_z_loss_mlp": 3.01367188, + "step": 2384, + "time_per_iteration": 4.751616954803467 + }, + { + "auxiliary_loss_clip": 0.01529993, + "auxiliary_loss_mlp": 0.01317276, + "balance_loss_clip": 1.16390967, + "balance_loss_mlp": 1.01934767, + "epoch": 0.2867792941742319, + "flos": 20883529143360.0, + "grad_norm": 2.449468878153713, + "language_loss": 0.84558105, + "learning_rate": 3.3474590114072316e-06, + "loss": 0.87405372, + "num_input_tokens_seen": 51081990, + "router_z_loss_clip": 3.6640625, + "router_z_loss_mlp": 2.9765625, + "step": 2385, + "time_per_iteration": 2.99090313911438 + }, + { + "auxiliary_loss_clip": 0.01530348, + "auxiliary_loss_mlp": 0.01328941, + "balance_loss_clip": 1.164446, + "balance_loss_mlp": 1.02834296, + "epoch": 0.28689953706487104, + "flos": 20665895175360.0, + "grad_norm": 2.0515056912211413, + "language_loss": 0.83054411, + "learning_rate": 3.3468832662713836e-06, + "loss": 0.85913706, + "num_input_tokens_seen": 51100235, + "router_z_loss_clip": 3.66015625, + "router_z_loss_mlp": 3.00195312, + "step": 2386, + "time_per_iteration": 3.2072510719299316 + }, + { + "auxiliary_loss_clip": 0.01524259, + "auxiliary_loss_mlp": 0.01329097, + "balance_loss_clip": 1.15718448, + "balance_loss_mlp": 1.02907145, + "epoch": 0.28701977995551015, + "flos": 12677082904800.0, + "grad_norm": 2.599025669083831, + "language_loss": 0.83725667, + "learning_rate": 3.346307316814415e-06, + "loss": 0.86579025, + "num_input_tokens_seen": 51115405, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 2.99609375, + "step": 2387, + "time_per_iteration": 2.985201835632324 + }, + { + "auxiliary_loss_clip": 0.01534859, + "auxiliary_loss_mlp": 0.01336246, + "balance_loss_clip": 1.16784799, + "balance_loss_mlp": 1.03393102, + "epoch": 0.2871400228461492, + "flos": 21254546047680.0, + "grad_norm": 2.615518043798803, + "language_loss": 0.75835639, + "learning_rate": 3.3457311631236965e-06, + "loss": 0.78706741, + "num_input_tokens_seen": 51136390, + "router_z_loss_clip": 3.66992188, + "router_z_loss_mlp": 3.01757812, + "step": 2388, + "time_per_iteration": 2.9512481689453125 + }, + { + "auxiliary_loss_clip": 0.01525069, + "auxiliary_loss_mlp": 0.01331082, + "balance_loss_clip": 1.1586566, + "balance_loss_mlp": 1.02686012, + "epoch": 0.2872602657367883, + "flos": 25121626074720.0, + "grad_norm": 2.0113895287161205, + "language_loss": 0.8468135, + "learning_rate": 3.345154805286631e-06, + "loss": 0.87537503, + "num_input_tokens_seen": 51156650, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 3.03710938, + "step": 2389, + "time_per_iteration": 2.967174530029297 + }, + { + "auxiliary_loss_clip": 0.01529813, + "auxiliary_loss_mlp": 0.01344088, + "balance_loss_clip": 1.16378367, + "balance_loss_mlp": 1.04024768, + "epoch": 0.2873805086274274, + "flos": 16648049255040.0, + "grad_norm": 2.851089279627925, + "language_loss": 0.76577187, + "learning_rate": 3.344578243390651e-06, + "loss": 0.79451084, + "num_input_tokens_seen": 51172210, + "router_z_loss_clip": 3.66015625, + "router_z_loss_mlp": 3.03320312, + "step": 2390, + "time_per_iteration": 2.930936574935913 + }, + { + "auxiliary_loss_clip": 0.01528564, + "auxiliary_loss_mlp": 0.01330439, + "balance_loss_clip": 1.16106236, + "balance_loss_mlp": 1.02984095, + "epoch": 0.2875007515180665, + "flos": 17422018938720.0, + "grad_norm": 2.6803220704400297, + "language_loss": 0.78299284, + "learning_rate": 3.3440014775232206e-06, + "loss": 0.81158292, + "num_input_tokens_seen": 51190265, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 3.00390625, + "step": 2391, + "time_per_iteration": 2.9851837158203125 + }, + { + "auxiliary_loss_clip": 0.01530582, + "auxiliary_loss_mlp": 0.01320926, + "balance_loss_clip": 1.16344368, + "balance_loss_mlp": 1.02147174, + "epoch": 0.2876209944087056, + "flos": 23436081885600.0, + "grad_norm": 2.329053217801696, + "language_loss": 0.71247613, + "learning_rate": 3.343424507771834e-06, + "loss": 0.74099123, + "num_input_tokens_seen": 51208475, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 2.99023438, + "step": 2392, + "time_per_iteration": 3.05267071723938 + }, + { + "auxiliary_loss_clip": 0.01525928, + "auxiliary_loss_mlp": 0.01331579, + "balance_loss_clip": 1.15938187, + "balance_loss_mlp": 1.03326952, + "epoch": 0.2877412372993447, + "flos": 13737223470240.0, + "grad_norm": 1.8158255403656882, + "language_loss": 0.86525786, + "learning_rate": 3.342847334224018e-06, + "loss": 0.89383292, + "num_input_tokens_seen": 51225875, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 2.9765625, + "step": 2393, + "time_per_iteration": 2.961097002029419 + }, + { + "auxiliary_loss_clip": 0.01628209, + "auxiliary_loss_mlp": 0.01223969, + "balance_loss_clip": 1.25483441, + "balance_loss_mlp": 0.9859314, + "epoch": 0.28786148018998375, + "flos": 58086721889280.0, + "grad_norm": 0.9454649764218154, + "language_loss": 0.62324297, + "learning_rate": 3.342269956967329e-06, + "loss": 0.65176463, + "num_input_tokens_seen": 51287780, + "router_z_loss_clip": 3.734375, + "router_z_loss_mlp": 2.375, + "step": 2394, + "time_per_iteration": 3.5398635864257812 + }, + { + "auxiliary_loss_clip": 0.01527996, + "auxiliary_loss_mlp": 0.01324795, + "balance_loss_clip": 1.16072237, + "balance_loss_mlp": 1.02248001, + "epoch": 0.28798172308062286, + "flos": 23436992161440.0, + "grad_norm": 2.8106017117745252, + "language_loss": 0.7214514, + "learning_rate": 3.341692376089355e-06, + "loss": 0.74997926, + "num_input_tokens_seen": 51303335, + "router_z_loss_clip": 3.67382812, + "router_z_loss_mlp": 3.01953125, + "step": 2395, + "time_per_iteration": 2.9887149333953857 + }, + { + "auxiliary_loss_clip": 0.01525829, + "auxiliary_loss_mlp": 0.01336689, + "balance_loss_clip": 1.16009712, + "balance_loss_mlp": 1.03475618, + "epoch": 0.288101965971262, + "flos": 25111878537600.0, + "grad_norm": 3.231332930864413, + "language_loss": 0.83932078, + "learning_rate": 3.3411145916777146e-06, + "loss": 0.86794591, + "num_input_tokens_seen": 51317495, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 3.01367188, + "step": 2396, + "time_per_iteration": 2.9442853927612305 + }, + { + "auxiliary_loss_clip": 0.01532694, + "auxiliary_loss_mlp": 0.01346624, + "balance_loss_clip": 1.16728795, + "balance_loss_mlp": 1.0444994, + "epoch": 0.28822220886190103, + "flos": 16254388874880.0, + "grad_norm": 3.62525143468791, + "language_loss": 0.91115713, + "learning_rate": 3.3405366038200566e-06, + "loss": 0.93995035, + "num_input_tokens_seen": 51336430, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 3.01757812, + "step": 2397, + "time_per_iteration": 3.004537343978882 + }, + { + "auxiliary_loss_clip": 0.01538333, + "auxiliary_loss_mlp": 0.01341719, + "balance_loss_clip": 1.17402399, + "balance_loss_mlp": 1.03997636, + "epoch": 0.28834245175254014, + "flos": 24537981719520.0, + "grad_norm": 2.6989691565579226, + "language_loss": 0.85419917, + "learning_rate": 3.3399584126040617e-06, + "loss": 0.88299978, + "num_input_tokens_seen": 51355930, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 3.01171875, + "step": 2398, + "time_per_iteration": 3.0810184478759766 + }, + { + "auxiliary_loss_clip": 0.01527278, + "auxiliary_loss_mlp": 0.01338295, + "balance_loss_clip": 1.16173077, + "balance_loss_mlp": 1.0369339, + "epoch": 0.2884626946431792, + "flos": 24573748338720.0, + "grad_norm": 2.2209676990437948, + "language_loss": 0.90793788, + "learning_rate": 3.339380018117441e-06, + "loss": 0.93659365, + "num_input_tokens_seen": 51376765, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 3.0078125, + "step": 2399, + "time_per_iteration": 2.9653584957122803 + }, + { + "auxiliary_loss_clip": 0.01533027, + "auxiliary_loss_mlp": 0.0133523, + "balance_loss_clip": 1.16734231, + "balance_loss_mlp": 1.03501284, + "epoch": 0.2885829375338183, + "flos": 16546514477760.0, + "grad_norm": 3.075935602162582, + "language_loss": 0.78409296, + "learning_rate": 3.3388014204479366e-06, + "loss": 0.81277555, + "num_input_tokens_seen": 51394570, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 2.99609375, + "step": 2400, + "time_per_iteration": 3.11972975730896 + }, + { + "auxiliary_loss_clip": 0.01530698, + "auxiliary_loss_mlp": 0.01339499, + "balance_loss_clip": 1.16611195, + "balance_loss_mlp": 1.03451383, + "epoch": 0.2887031804244574, + "flos": 24063533629920.0, + "grad_norm": 2.4674100940462242, + "language_loss": 0.91654181, + "learning_rate": 3.338222619683321e-06, + "loss": 0.94524384, + "num_input_tokens_seen": 51414535, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 3.04492188, + "step": 2401, + "time_per_iteration": 2.9476354122161865 + }, + { + "auxiliary_loss_clip": 0.01524803, + "auxiliary_loss_mlp": 0.01334735, + "balance_loss_clip": 1.16108394, + "balance_loss_mlp": 1.03184795, + "epoch": 0.2888234233150965, + "flos": 23332916197440.0, + "grad_norm": 2.346452921231167, + "language_loss": 0.73182476, + "learning_rate": 3.337643615911398e-06, + "loss": 0.7604202, + "num_input_tokens_seen": 51434160, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 3.0234375, + "step": 2402, + "time_per_iteration": 2.9531898498535156 + }, + { + "auxiliary_loss_clip": 0.01528392, + "auxiliary_loss_mlp": 0.01326369, + "balance_loss_clip": 1.16129971, + "balance_loss_mlp": 1.02634358, + "epoch": 0.2889436662057356, + "flos": 22274937537120.0, + "grad_norm": 2.0768690615683556, + "language_loss": 0.79082894, + "learning_rate": 3.3370644092200026e-06, + "loss": 0.81937659, + "num_input_tokens_seen": 51451435, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 2.99609375, + "step": 2403, + "time_per_iteration": 2.9960336685180664 + }, + { + "auxiliary_loss_clip": 0.01528786, + "auxiliary_loss_mlp": 0.01331195, + "balance_loss_clip": 1.16235483, + "balance_loss_mlp": 1.02964365, + "epoch": 0.2890639090963747, + "flos": 21619115164800.0, + "grad_norm": 1.7782054838629975, + "language_loss": 0.78174436, + "learning_rate": 3.3364849996969985e-06, + "loss": 0.81034416, + "num_input_tokens_seen": 51471455, + "router_z_loss_clip": 3.6640625, + "router_z_loss_mlp": 3.00976562, + "step": 2404, + "time_per_iteration": 2.94299054145813 + }, + { + "auxiliary_loss_clip": 0.01526916, + "auxiliary_loss_mlp": 0.0133802, + "balance_loss_clip": 1.16240764, + "balance_loss_mlp": 1.03913879, + "epoch": 0.28918415198701375, + "flos": 28588104868320.0, + "grad_norm": 2.64627418158877, + "language_loss": 0.85350001, + "learning_rate": 3.335905387430283e-06, + "loss": 0.8821494, + "num_input_tokens_seen": 51492890, + "router_z_loss_clip": 3.65039062, + "router_z_loss_mlp": 2.984375, + "step": 2405, + "time_per_iteration": 3.833845376968384 + }, + { + "auxiliary_loss_clip": 0.01525575, + "auxiliary_loss_mlp": 0.01330616, + "balance_loss_clip": 1.1611104, + "balance_loss_mlp": 1.03059006, + "epoch": 0.28930439487765286, + "flos": 21946855674240.0, + "grad_norm": 1.9672504914138462, + "language_loss": 0.83098525, + "learning_rate": 3.335325572507782e-06, + "loss": 0.85954714, + "num_input_tokens_seen": 51513390, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 2.99804688, + "step": 2406, + "time_per_iteration": 3.9280340671539307 + }, + { + "auxiliary_loss_clip": 0.01542756, + "auxiliary_loss_mlp": 0.01337752, + "balance_loss_clip": 1.1770786, + "balance_loss_mlp": 1.03467441, + "epoch": 0.28942463776829197, + "flos": 19283968821600.0, + "grad_norm": 1.9663286863505303, + "language_loss": 0.73780054, + "learning_rate": 3.3347455550174537e-06, + "loss": 0.76660562, + "num_input_tokens_seen": 51532730, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 3.02734375, + "step": 2407, + "time_per_iteration": 2.9473719596862793 + }, + { + "auxiliary_loss_clip": 0.0152116, + "auxiliary_loss_mlp": 0.01332307, + "balance_loss_clip": 1.15606177, + "balance_loss_mlp": 1.03266263, + "epoch": 0.289544880658931, + "flos": 14647356705600.0, + "grad_norm": 2.1981431828452527, + "language_loss": 0.68196011, + "learning_rate": 3.3341653350472864e-06, + "loss": 0.71049482, + "num_input_tokens_seen": 51549560, + "router_z_loss_clip": 3.65429688, + "router_z_loss_mlp": 2.99414062, + "step": 2408, + "time_per_iteration": 3.0181589126586914 + }, + { + "auxiliary_loss_clip": 0.0152276, + "auxiliary_loss_mlp": 0.01347529, + "balance_loss_clip": 1.16033578, + "balance_loss_mlp": 1.03834796, + "epoch": 0.28966512354957014, + "flos": 28624440409920.0, + "grad_norm": 3.263330479625285, + "language_loss": 0.69023722, + "learning_rate": 3.333584912685298e-06, + "loss": 0.71894014, + "num_input_tokens_seen": 51568180, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 3.08789062, + "step": 2409, + "time_per_iteration": 3.0891001224517822 + }, + { + "auxiliary_loss_clip": 0.01624931, + "auxiliary_loss_mlp": 0.01294868, + "balance_loss_clip": 1.25310397, + "balance_loss_mlp": 1.05149078, + "epoch": 0.28978536644020925, + "flos": 64718716612320.0, + "grad_norm": 0.8893715380981932, + "language_loss": 0.55536407, + "learning_rate": 3.3330042880195385e-06, + "loss": 0.58456206, + "num_input_tokens_seen": 51622530, + "router_z_loss_clip": 3.71875, + "router_z_loss_mlp": 2.4296875, + "step": 2410, + "time_per_iteration": 3.5127856731414795 + }, + { + "auxiliary_loss_clip": 0.01522503, + "auxiliary_loss_mlp": 0.01326979, + "balance_loss_clip": 1.15936005, + "balance_loss_mlp": 1.0261898, + "epoch": 0.2899056093308483, + "flos": 18626629322880.0, + "grad_norm": 2.2062969802034105, + "language_loss": 0.78724772, + "learning_rate": 3.3324234611380888e-06, + "loss": 0.81574255, + "num_input_tokens_seen": 51641260, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 3.00390625, + "step": 2411, + "time_per_iteration": 3.9087584018707275 + }, + { + "auxiliary_loss_clip": 0.01536711, + "auxiliary_loss_mlp": 0.01328742, + "balance_loss_clip": 1.17246068, + "balance_loss_mlp": 1.02871633, + "epoch": 0.2900258522214874, + "flos": 22895979422400.0, + "grad_norm": 1.736888346537513, + "language_loss": 0.8165617, + "learning_rate": 3.3318424321290596e-06, + "loss": 0.84521633, + "num_input_tokens_seen": 51660975, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.99414062, + "step": 2412, + "time_per_iteration": 3.887099266052246 + }, + { + "auxiliary_loss_clip": 0.01608977, + "auxiliary_loss_mlp": 0.01227333, + "balance_loss_clip": 1.23835087, + "balance_loss_mlp": 0.99082184, + "epoch": 0.2901460951121265, + "flos": 71111799669600.0, + "grad_norm": 0.9183490901449225, + "language_loss": 0.59951508, + "learning_rate": 3.3312612010805917e-06, + "loss": 0.62787819, + "num_input_tokens_seen": 51720550, + "router_z_loss_clip": 3.703125, + "router_z_loss_mlp": 2.359375, + "step": 2413, + "time_per_iteration": 3.5261502265930176 + }, + { + "auxiliary_loss_clip": 0.01518671, + "auxiliary_loss_mlp": 0.01338166, + "balance_loss_clip": 1.15560544, + "balance_loss_mlp": 1.03775859, + "epoch": 0.2902663380027656, + "flos": 32163741999360.0, + "grad_norm": 1.923121846480491, + "language_loss": 0.70386243, + "learning_rate": 3.330679768080858e-06, + "loss": 0.73243082, + "num_input_tokens_seen": 51744435, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.99804688, + "step": 2414, + "time_per_iteration": 3.1100475788116455 + }, + { + "auxiliary_loss_clip": 0.01525546, + "auxiliary_loss_mlp": 0.01342376, + "balance_loss_clip": 1.16356671, + "balance_loss_mlp": 1.04349482, + "epoch": 0.2903865808934047, + "flos": 29354261351040.0, + "grad_norm": 2.432346736079944, + "language_loss": 0.83690512, + "learning_rate": 3.3300981332180627e-06, + "loss": 0.86558431, + "num_input_tokens_seen": 51763640, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 2.984375, + "step": 2415, + "time_per_iteration": 3.0807197093963623 + }, + { + "auxiliary_loss_clip": 0.01513113, + "auxiliary_loss_mlp": 0.01337009, + "balance_loss_clip": 1.15035474, + "balance_loss_mlp": 1.03984451, + "epoch": 0.29050682378404374, + "flos": 17090751110400.0, + "grad_norm": 1.847828753797543, + "language_loss": 0.79989445, + "learning_rate": 3.3295162965804373e-06, + "loss": 0.82839566, + "num_input_tokens_seen": 51782135, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.96875, + "step": 2416, + "time_per_iteration": 3.0041847229003906 + }, + { + "auxiliary_loss_clip": 0.01516413, + "auxiliary_loss_mlp": 0.01332324, + "balance_loss_clip": 1.15268159, + "balance_loss_mlp": 1.03954577, + "epoch": 0.29062706667468285, + "flos": 17860169414880.0, + "grad_norm": 2.1383114186942564, + "language_loss": 0.78681797, + "learning_rate": 3.328934258256247e-06, + "loss": 0.81530535, + "num_input_tokens_seen": 51800200, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 2.9296875, + "step": 2417, + "time_per_iteration": 3.030395269393921 + }, + { + "auxiliary_loss_clip": 0.01509178, + "auxiliary_loss_mlp": 0.01328678, + "balance_loss_clip": 1.14468896, + "balance_loss_mlp": 1.02712584, + "epoch": 0.29074730956532197, + "flos": 24282115801920.0, + "grad_norm": 2.114298538689679, + "language_loss": 0.67279994, + "learning_rate": 3.3283520183337856e-06, + "loss": 0.70117843, + "num_input_tokens_seen": 51819905, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 3.01171875, + "step": 2418, + "time_per_iteration": 3.0362212657928467 + }, + { + "auxiliary_loss_clip": 0.01506851, + "auxiliary_loss_mlp": 0.01325747, + "balance_loss_clip": 1.14387548, + "balance_loss_mlp": 1.02724683, + "epoch": 0.290867552455961, + "flos": 22342867600320.0, + "grad_norm": 2.0420989223220904, + "language_loss": 0.68978393, + "learning_rate": 3.3277695769013797e-06, + "loss": 0.71810997, + "num_input_tokens_seen": 51839350, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.98046875, + "step": 2419, + "time_per_iteration": 2.969766855239868 + }, + { + "auxiliary_loss_clip": 0.01516103, + "auxiliary_loss_mlp": 0.0133431, + "balance_loss_clip": 1.15301061, + "balance_loss_mlp": 1.03256726, + "epoch": 0.29098779534660013, + "flos": 23188446378720.0, + "grad_norm": 2.959943027659718, + "language_loss": 0.77373624, + "learning_rate": 3.327186934047385e-06, + "loss": 0.80224037, + "num_input_tokens_seen": 51858045, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 3.01367188, + "step": 2420, + "time_per_iteration": 3.036864757537842 + }, + { + "auxiliary_loss_clip": 0.01512814, + "auxiliary_loss_mlp": 0.01329007, + "balance_loss_clip": 1.14917099, + "balance_loss_mlp": 1.03126955, + "epoch": 0.29110803823723924, + "flos": 15305985761760.0, + "grad_norm": 3.5947407622758196, + "language_loss": 0.65654063, + "learning_rate": 3.3266040898601877e-06, + "loss": 0.68495888, + "num_input_tokens_seen": 51875880, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 2.97460938, + "step": 2421, + "time_per_iteration": 2.9965429306030273 + }, + { + "auxiliary_loss_clip": 0.0151405, + "auxiliary_loss_mlp": 0.01333106, + "balance_loss_clip": 1.15256727, + "balance_loss_mlp": 1.03651357, + "epoch": 0.2912282811278783, + "flos": 22597216391520.0, + "grad_norm": 2.219780624481885, + "language_loss": 0.78181303, + "learning_rate": 3.3260210444282045e-06, + "loss": 0.81028461, + "num_input_tokens_seen": 51893835, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 2.96289062, + "step": 2422, + "time_per_iteration": 3.1435065269470215 + }, + { + "auxiliary_loss_clip": 0.0150765, + "auxiliary_loss_mlp": 0.0133202, + "balance_loss_clip": 1.14349449, + "balance_loss_mlp": 1.03123093, + "epoch": 0.2913485240185174, + "flos": 24500166979680.0, + "grad_norm": 4.273827590851698, + "language_loss": 0.73446596, + "learning_rate": 3.325437797839883e-06, + "loss": 0.76286256, + "num_input_tokens_seen": 51912205, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 3.00195312, + "step": 2423, + "time_per_iteration": 3.0564122200012207 + }, + { + "auxiliary_loss_clip": 0.0150895, + "auxiliary_loss_mlp": 0.01339101, + "balance_loss_clip": 1.1459744, + "balance_loss_mlp": 1.03793097, + "epoch": 0.2914687669091565, + "flos": 17932954282560.0, + "grad_norm": 2.6518604688414475, + "language_loss": 0.7560429, + "learning_rate": 3.3248543501837015e-06, + "loss": 0.78452337, + "num_input_tokens_seen": 51929410, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 3.00585938, + "step": 2424, + "time_per_iteration": 2.9631314277648926 + }, + { + "auxiliary_loss_clip": 0.01510429, + "auxiliary_loss_mlp": 0.01338424, + "balance_loss_clip": 1.14698923, + "balance_loss_mlp": 1.03877926, + "epoch": 0.2915890097997956, + "flos": 22531562017920.0, + "grad_norm": 1.913625784618738, + "language_loss": 0.7721808, + "learning_rate": 3.3242707015481684e-06, + "loss": 0.80066925, + "num_input_tokens_seen": 51949345, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.9921875, + "step": 2425, + "time_per_iteration": 3.0125997066497803 + }, + { + "auxiliary_loss_clip": 0.01507679, + "auxiliary_loss_mlp": 0.01326463, + "balance_loss_clip": 1.14450431, + "balance_loss_mlp": 1.02929771, + "epoch": 0.2917092526904347, + "flos": 13846950730080.0, + "grad_norm": 2.388154240318181, + "language_loss": 0.80848074, + "learning_rate": 3.323686852021823e-06, + "loss": 0.83682215, + "num_input_tokens_seen": 51966855, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 2.96679688, + "step": 2426, + "time_per_iteration": 2.9446463584899902 + }, + { + "auxiliary_loss_clip": 0.01507386, + "auxiliary_loss_mlp": 0.01336242, + "balance_loss_clip": 1.14491045, + "balance_loss_mlp": 1.03678775, + "epoch": 0.2918294955810738, + "flos": 22677093825120.0, + "grad_norm": 7.65716803761427, + "language_loss": 0.79944897, + "learning_rate": 3.323102801693235e-06, + "loss": 0.82788527, + "num_input_tokens_seen": 51985620, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.99023438, + "step": 2427, + "time_per_iteration": 2.9996492862701416 + }, + { + "auxiliary_loss_clip": 0.01507221, + "auxiliary_loss_mlp": 0.01317672, + "balance_loss_clip": 1.14347899, + "balance_loss_mlp": 1.0210793, + "epoch": 0.29194973847171285, + "flos": 23440405695840.0, + "grad_norm": 2.7258525748092652, + "language_loss": 0.80816168, + "learning_rate": 3.322518550651003e-06, + "loss": 0.83641064, + "num_input_tokens_seen": 52004930, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 2.9609375, + "step": 2428, + "time_per_iteration": 3.110562801361084 + }, + { + "auxiliary_loss_clip": 0.01503336, + "auxiliary_loss_mlp": 0.01338983, + "balance_loss_clip": 1.14049959, + "balance_loss_mlp": 1.03800356, + "epoch": 0.29206998136235196, + "flos": 21911278695840.0, + "grad_norm": 2.205750617418984, + "language_loss": 0.81623375, + "learning_rate": 3.3219340989837586e-06, + "loss": 0.84465694, + "num_input_tokens_seen": 52024920, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 3.00585938, + "step": 2429, + "time_per_iteration": 2.994018077850342 + }, + { + "auxiliary_loss_clip": 0.01505234, + "auxiliary_loss_mlp": 0.01332917, + "balance_loss_clip": 1.14426816, + "balance_loss_mlp": 1.03250921, + "epoch": 0.292190224252991, + "flos": 23217841067040.0, + "grad_norm": 1.9939376487005105, + "language_loss": 0.80696404, + "learning_rate": 3.3213494467801625e-06, + "loss": 0.83534551, + "num_input_tokens_seen": 52044095, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 3.0, + "step": 2430, + "time_per_iteration": 3.090907573699951 + }, + { + "auxiliary_loss_clip": 0.01507787, + "auxiliary_loss_mlp": 0.0132192, + "balance_loss_clip": 1.14629841, + "balance_loss_mlp": 1.02246618, + "epoch": 0.2923104671436301, + "flos": 20742927996960.0, + "grad_norm": 9.123685053830656, + "language_loss": 0.716048, + "learning_rate": 3.3207645941289063e-06, + "loss": 0.74434507, + "num_input_tokens_seen": 52062440, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 2.99023438, + "step": 2431, + "time_per_iteration": 3.0376527309417725 + }, + { + "auxiliary_loss_clip": 0.0150572, + "auxiliary_loss_mlp": 0.01344535, + "balance_loss_clip": 1.14381075, + "balance_loss_mlp": 1.04412723, + "epoch": 0.29243071003426924, + "flos": 35812050213600.0, + "grad_norm": 2.0460722092824195, + "language_loss": 0.80428731, + "learning_rate": 3.320179541118711e-06, + "loss": 0.8327899, + "num_input_tokens_seen": 52084940, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.99804688, + "step": 2432, + "time_per_iteration": 3.218311071395874 + }, + { + "auxiliary_loss_clip": 0.01530142, + "auxiliary_loss_mlp": 0.012285, + "balance_loss_clip": 1.164397, + "balance_loss_mlp": 0.99427795, + "epoch": 0.2925509529249083, + "flos": 58088732081760.0, + "grad_norm": 1.1712184881556762, + "language_loss": 0.60295904, + "learning_rate": 3.3195942878383293e-06, + "loss": 0.6305455, + "num_input_tokens_seen": 52141040, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 2.3359375, + "step": 2433, + "time_per_iteration": 5.16930079460144 + }, + { + "auxiliary_loss_clip": 0.01504897, + "auxiliary_loss_mlp": 0.01338164, + "balance_loss_clip": 1.14325142, + "balance_loss_mlp": 1.03508639, + "epoch": 0.2926711958155474, + "flos": 21399243435360.0, + "grad_norm": 2.1205728942119326, + "language_loss": 0.77870393, + "learning_rate": 3.319008834376543e-06, + "loss": 0.80713451, + "num_input_tokens_seen": 52160730, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 3.02539062, + "step": 2434, + "time_per_iteration": 3.0049936771392822 + }, + { + "auxiliary_loss_clip": 0.01500524, + "auxiliary_loss_mlp": 0.01330723, + "balance_loss_clip": 1.13862431, + "balance_loss_mlp": 1.03203189, + "epoch": 0.2927914387061865, + "flos": 23188067097120.0, + "grad_norm": 2.6862392064081755, + "language_loss": 0.88935888, + "learning_rate": 3.3184231808221654e-06, + "loss": 0.91767132, + "num_input_tokens_seen": 52175055, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.98242188, + "step": 2435, + "time_per_iteration": 2.9537410736083984 + }, + { + "auxiliary_loss_clip": 0.01503768, + "auxiliary_loss_mlp": 0.01338826, + "balance_loss_clip": 1.14037085, + "balance_loss_mlp": 1.03784609, + "epoch": 0.29291168159682557, + "flos": 22457753089920.0, + "grad_norm": 5.794996935765969, + "language_loss": 0.62840438, + "learning_rate": 3.3178373272640394e-06, + "loss": 0.65683031, + "num_input_tokens_seen": 52194150, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 3.00585938, + "step": 2436, + "time_per_iteration": 3.0376100540161133 + }, + { + "auxiliary_loss_clip": 0.01502885, + "auxiliary_loss_mlp": 0.0133184, + "balance_loss_clip": 1.14110339, + "balance_loss_mlp": 1.0325768, + "epoch": 0.2930319244874647, + "flos": 21172317068160.0, + "grad_norm": 2.2192451959709167, + "language_loss": 0.84683549, + "learning_rate": 3.3172512737910387e-06, + "loss": 0.87518269, + "num_input_tokens_seen": 52211660, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.98828125, + "step": 2437, + "time_per_iteration": 3.03822922706604 + }, + { + "auxiliary_loss_clip": 0.01504767, + "auxiliary_loss_mlp": 0.01342663, + "balance_loss_clip": 1.14362252, + "balance_loss_mlp": 1.04149234, + "epoch": 0.2931521673781038, + "flos": 31360529340000.0, + "grad_norm": 2.310701123821505, + "language_loss": 0.88253236, + "learning_rate": 3.3166650204920674e-06, + "loss": 0.91100663, + "num_input_tokens_seen": 52232830, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 3.0078125, + "step": 2438, + "time_per_iteration": 3.852139949798584 + }, + { + "auxiliary_loss_clip": 0.01505095, + "auxiliary_loss_mlp": 0.01347759, + "balance_loss_clip": 1.14387846, + "balance_loss_mlp": 1.04487228, + "epoch": 0.29327241026874284, + "flos": 24202959003360.0, + "grad_norm": 2.0848631217808222, + "language_loss": 0.81743419, + "learning_rate": 3.316078567456059e-06, + "loss": 0.84596276, + "num_input_tokens_seen": 52250670, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 3.02539062, + "step": 2439, + "time_per_iteration": 3.8908448219299316 + }, + { + "auxiliary_loss_clip": 0.01502831, + "auxiliary_loss_mlp": 0.01332727, + "balance_loss_clip": 1.14083362, + "balance_loss_mlp": 1.03270113, + "epoch": 0.29339265315938196, + "flos": 24244869984480.0, + "grad_norm": 1.6561907932181936, + "language_loss": 0.75535339, + "learning_rate": 3.3154919147719786e-06, + "loss": 0.78370899, + "num_input_tokens_seen": 52271685, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.99609375, + "step": 2440, + "time_per_iteration": 3.0379998683929443 + }, + { + "auxiliary_loss_clip": 0.01500981, + "auxiliary_loss_mlp": 0.01337913, + "balance_loss_clip": 1.14000618, + "balance_loss_mlp": 1.03807795, + "epoch": 0.29351289605002107, + "flos": 16948556981280.0, + "grad_norm": 14.208350593507065, + "language_loss": 0.8642019, + "learning_rate": 3.31490506252882e-06, + "loss": 0.89259088, + "num_input_tokens_seen": 52291065, + "router_z_loss_clip": 3.60742188, + "router_z_loss_mlp": 2.9921875, + "step": 2441, + "time_per_iteration": 2.9565083980560303 + }, + { + "auxiliary_loss_clip": 0.01497015, + "auxiliary_loss_mlp": 0.0133955, + "balance_loss_clip": 1.13467669, + "balance_loss_mlp": 1.03990531, + "epoch": 0.2936331389406601, + "flos": 19831163850720.0, + "grad_norm": 1.9726523603640063, + "language_loss": 0.84122229, + "learning_rate": 3.31431801081561e-06, + "loss": 0.86958802, + "num_input_tokens_seen": 52310000, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.99023438, + "step": 2442, + "time_per_iteration": 3.018371105194092 + }, + { + "auxiliary_loss_clip": 0.01531594, + "auxiliary_loss_mlp": 0.01289917, + "balance_loss_clip": 1.16799712, + "balance_loss_mlp": 1.04653931, + "epoch": 0.29375338183129923, + "flos": 71423685843840.0, + "grad_norm": 0.9120181817873774, + "language_loss": 0.67822587, + "learning_rate": 3.313730759721402e-06, + "loss": 0.70644099, + "num_input_tokens_seen": 52372930, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.4296875, + "step": 2443, + "time_per_iteration": 3.5917556285858154 + }, + { + "auxiliary_loss_clip": 0.01506315, + "auxiliary_loss_mlp": 0.01332757, + "balance_loss_clip": 1.14539516, + "balance_loss_mlp": 1.03768992, + "epoch": 0.29387362472193834, + "flos": 22056696718560.0, + "grad_norm": 2.7386896075564735, + "language_loss": 0.86135656, + "learning_rate": 3.313143309335282e-06, + "loss": 0.88974726, + "num_input_tokens_seen": 52391420, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 2.953125, + "step": 2444, + "time_per_iteration": 2.9786934852600098 + }, + { + "auxiliary_loss_clip": 0.01505426, + "auxiliary_loss_mlp": 0.01316394, + "balance_loss_clip": 1.14354014, + "balance_loss_mlp": 1.02094531, + "epoch": 0.2939938676125774, + "flos": 22968650505600.0, + "grad_norm": 2.345020953574726, + "language_loss": 0.8479892, + "learning_rate": 3.3125556597463665e-06, + "loss": 0.87620735, + "num_input_tokens_seen": 52410725, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.95507812, + "step": 2445, + "time_per_iteration": 3.007267475128174 + }, + { + "auxiliary_loss_clip": 0.01509614, + "auxiliary_loss_mlp": 0.01332436, + "balance_loss_clip": 1.14881921, + "balance_loss_mlp": 1.03393555, + "epoch": 0.2941141105032165, + "flos": 31361439615840.0, + "grad_norm": 1.7661130122476068, + "language_loss": 0.6654529, + "learning_rate": 3.311967811043801e-06, + "loss": 0.69387347, + "num_input_tokens_seen": 52432645, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 2.98046875, + "step": 2446, + "time_per_iteration": 3.0248610973358154 + }, + { + "auxiliary_loss_clip": 0.01521058, + "auxiliary_loss_mlp": 0.01323674, + "balance_loss_clip": 1.16054773, + "balance_loss_mlp": 1.02860725, + "epoch": 0.29423435339385556, + "flos": 23224402638720.0, + "grad_norm": 2.3900031583814734, + "language_loss": 0.8171382, + "learning_rate": 3.3113797633167617e-06, + "loss": 0.84558547, + "num_input_tokens_seen": 52450940, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 2.94921875, + "step": 2447, + "time_per_iteration": 3.047945499420166 + }, + { + "auxiliary_loss_clip": 0.01514926, + "auxiliary_loss_mlp": 0.01330508, + "balance_loss_clip": 1.15397441, + "balance_loss_mlp": 1.03353429, + "epoch": 0.2943545962844947, + "flos": 26866149281280.0, + "grad_norm": 2.4213201771917623, + "language_loss": 0.68737739, + "learning_rate": 3.310791516654455e-06, + "loss": 0.71583176, + "num_input_tokens_seen": 52468000, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 2.96679688, + "step": 2448, + "time_per_iteration": 3.059932231903076 + }, + { + "auxiliary_loss_clip": 0.01524289, + "auxiliary_loss_mlp": 0.01345833, + "balance_loss_clip": 1.16438103, + "balance_loss_mlp": 1.04695177, + "epoch": 0.2944748391751338, + "flos": 20233851132960.0, + "grad_norm": 2.4406586936252905, + "language_loss": 0.79964721, + "learning_rate": 3.3102030711461177e-06, + "loss": 0.82834852, + "num_input_tokens_seen": 52487575, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 2.98242188, + "step": 2449, + "time_per_iteration": 3.006023406982422 + }, + { + "auxiliary_loss_clip": 0.01520489, + "auxiliary_loss_mlp": 0.01327416, + "balance_loss_clip": 1.16136479, + "balance_loss_mlp": 1.02853394, + "epoch": 0.29459508206577284, + "flos": 15962794266240.0, + "grad_norm": 2.0084693526138646, + "language_loss": 0.67607987, + "learning_rate": 3.3096144268810156e-06, + "loss": 0.70455891, + "num_input_tokens_seen": 52506335, + "router_z_loss_clip": 3.59179688, + "router_z_loss_mlp": 2.984375, + "step": 2450, + "time_per_iteration": 3.079728126525879 + }, + { + "auxiliary_loss_clip": 0.01513427, + "auxiliary_loss_mlp": 0.01327794, + "balance_loss_clip": 1.15255785, + "balance_loss_mlp": 1.03368068, + "epoch": 0.29471532495641195, + "flos": 20414997846720.0, + "grad_norm": 1.9941225970803915, + "language_loss": 0.72788537, + "learning_rate": 3.3090255839484462e-06, + "loss": 0.75629759, + "num_input_tokens_seen": 52524330, + "router_z_loss_clip": 3.609375, + "router_z_loss_mlp": 2.93945312, + "step": 2451, + "time_per_iteration": 3.0128064155578613 + }, + { + "auxiliary_loss_clip": 0.01512363, + "auxiliary_loss_mlp": 0.01321528, + "balance_loss_clip": 1.15278924, + "balance_loss_mlp": 1.02665186, + "epoch": 0.29483556784705106, + "flos": 20378700233280.0, + "grad_norm": 2.8771487434882106, + "language_loss": 0.85487604, + "learning_rate": 3.3084365424377366e-06, + "loss": 0.88321495, + "num_input_tokens_seen": 52543095, + "router_z_loss_clip": 3.59570312, + "router_z_loss_mlp": 2.9453125, + "step": 2452, + "time_per_iteration": 3.023791551589966 + }, + { + "auxiliary_loss_clip": 0.01554192, + "auxiliary_loss_mlp": 0.01249535, + "balance_loss_clip": 1.19351244, + "balance_loss_mlp": 1.01226044, + "epoch": 0.2949558107376901, + "flos": 68561484688800.0, + "grad_norm": 0.7694477858414076, + "language_loss": 0.55973995, + "learning_rate": 3.307847302438245e-06, + "loss": 0.58777726, + "num_input_tokens_seen": 52597075, + "router_z_loss_clip": 3.609375, + "router_z_loss_mlp": 2.3671875, + "step": 2453, + "time_per_iteration": 3.4744839668273926 + }, + { + "auxiliary_loss_clip": 0.01517788, + "auxiliary_loss_mlp": 0.01333867, + "balance_loss_clip": 1.15803528, + "balance_loss_mlp": 1.03651094, + "epoch": 0.2950760536283292, + "flos": 16108932924000.0, + "grad_norm": 2.8296831705142003, + "language_loss": 0.77774787, + "learning_rate": 3.3072578640393562e-06, + "loss": 0.80626446, + "num_input_tokens_seen": 52614410, + "router_z_loss_clip": 3.60351562, + "router_z_loss_mlp": 2.97070312, + "step": 2454, + "time_per_iteration": 3.1071255207061768 + }, + { + "auxiliary_loss_clip": 0.01516796, + "auxiliary_loss_mlp": 0.01329913, + "balance_loss_clip": 1.15847361, + "balance_loss_mlp": 1.03427362, + "epoch": 0.29519629651896834, + "flos": 20485620809280.0, + "grad_norm": 1.8999854490242334, + "language_loss": 0.79838508, + "learning_rate": 3.3066682273304886e-06, + "loss": 0.8268522, + "num_input_tokens_seen": 52632055, + "router_z_loss_clip": 3.58203125, + "router_z_loss_mlp": 2.953125, + "step": 2455, + "time_per_iteration": 3.119802236557007 + }, + { + "auxiliary_loss_clip": 0.01518935, + "auxiliary_loss_mlp": 0.01330956, + "balance_loss_clip": 1.160429, + "balance_loss_mlp": 1.03169322, + "epoch": 0.2953165394096074, + "flos": 18918489428640.0, + "grad_norm": 3.0527736545099535, + "language_loss": 0.79099488, + "learning_rate": 3.3060783924010904e-06, + "loss": 0.81949383, + "num_input_tokens_seen": 52649980, + "router_z_loss_clip": 3.58789062, + "router_z_loss_mlp": 2.98828125, + "step": 2456, + "time_per_iteration": 3.0401604175567627 + }, + { + "auxiliary_loss_clip": 0.01513397, + "auxiliary_loss_mlp": 0.01335583, + "balance_loss_clip": 1.15456414, + "balance_loss_mlp": 1.0346036, + "epoch": 0.2954367823002465, + "flos": 20626184027520.0, + "grad_norm": 2.1133388325998883, + "language_loss": 0.84553504, + "learning_rate": 3.3054883593406387e-06, + "loss": 0.87402481, + "num_input_tokens_seen": 52664730, + "router_z_loss_clip": 3.59179688, + "router_z_loss_mlp": 3.00390625, + "step": 2457, + "time_per_iteration": 3.034926414489746 + }, + { + "auxiliary_loss_clip": 0.01514577, + "auxiliary_loss_mlp": 0.01341072, + "balance_loss_clip": 1.15576088, + "balance_loss_mlp": 1.04142725, + "epoch": 0.2955570251908856, + "flos": 31178206853280.0, + "grad_norm": 2.8569115920756225, + "language_loss": 0.6530875, + "learning_rate": 3.3048981282386404e-06, + "loss": 0.68164396, + "num_input_tokens_seen": 52686040, + "router_z_loss_clip": 3.59179688, + "router_z_loss_mlp": 2.9921875, + "step": 2458, + "time_per_iteration": 3.0682339668273926 + }, + { + "auxiliary_loss_clip": 0.01521226, + "auxiliary_loss_mlp": 0.01341695, + "balance_loss_clip": 1.16352415, + "balance_loss_mlp": 1.04147792, + "epoch": 0.29567726808152467, + "flos": 21652340597280.0, + "grad_norm": 3.1024380989446954, + "language_loss": 0.82559615, + "learning_rate": 3.304307699184634e-06, + "loss": 0.8542254, + "num_input_tokens_seen": 52704630, + "router_z_loss_clip": 3.57617188, + "router_z_loss_mlp": 2.99804688, + "step": 2459, + "time_per_iteration": 3.8779456615448 + }, + { + "auxiliary_loss_clip": 0.01515511, + "auxiliary_loss_mlp": 0.01338224, + "balance_loss_clip": 1.15741682, + "balance_loss_mlp": 1.04048681, + "epoch": 0.2957975109721638, + "flos": 24246197470080.0, + "grad_norm": 2.1894681675431076, + "language_loss": 0.7870028, + "learning_rate": 3.3037170722681866e-06, + "loss": 0.81554013, + "num_input_tokens_seen": 52725465, + "router_z_loss_clip": 3.5859375, + "router_z_loss_mlp": 2.97460938, + "step": 2460, + "time_per_iteration": 3.0514724254608154 + }, + { + "auxiliary_loss_clip": 0.01517139, + "auxiliary_loss_mlp": 0.01330259, + "balance_loss_clip": 1.1577822, + "balance_loss_mlp": 1.0296607, + "epoch": 0.29591775386280283, + "flos": 13481395480800.0, + "grad_norm": 2.007067179259726, + "language_loss": 0.68330896, + "learning_rate": 3.3031262475788956e-06, + "loss": 0.71178299, + "num_input_tokens_seen": 52742405, + "router_z_loss_clip": 3.59570312, + "router_z_loss_mlp": 3.00195312, + "step": 2461, + "time_per_iteration": 3.903158664703369 + }, + { + "auxiliary_loss_clip": 0.01521223, + "auxiliary_loss_mlp": 0.01331512, + "balance_loss_clip": 1.16321337, + "balance_loss_mlp": 1.03282118, + "epoch": 0.29603799675344195, + "flos": 17751769640640.0, + "grad_norm": 1.9720734248132887, + "language_loss": 0.73243898, + "learning_rate": 3.3025352252063897e-06, + "loss": 0.7609663, + "num_input_tokens_seen": 52761100, + "router_z_loss_clip": 3.58203125, + "router_z_loss_mlp": 2.98046875, + "step": 2462, + "time_per_iteration": 2.9957163333892822 + }, + { + "auxiliary_loss_clip": 0.01525123, + "auxiliary_loss_mlp": 0.01349447, + "balance_loss_clip": 1.16823721, + "balance_loss_mlp": 1.04789484, + "epoch": 0.29615823964408106, + "flos": 22786138378080.0, + "grad_norm": 2.083251017082594, + "language_loss": 0.75081998, + "learning_rate": 3.3019440052403252e-06, + "loss": 0.77956569, + "num_input_tokens_seen": 52780965, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 3.00976562, + "step": 2463, + "time_per_iteration": 2.9409961700439453 + }, + { + "auxiliary_loss_clip": 0.01516977, + "auxiliary_loss_mlp": 0.0134311, + "balance_loss_clip": 1.15836215, + "balance_loss_mlp": 1.04213047, + "epoch": 0.2962784825347201, + "flos": 23516490313440.0, + "grad_norm": 2.7237814175060313, + "language_loss": 0.71275282, + "learning_rate": 3.30135258777039e-06, + "loss": 0.74135369, + "num_input_tokens_seen": 52800335, + "router_z_loss_clip": 3.5859375, + "router_z_loss_mlp": 3.00585938, + "step": 2464, + "time_per_iteration": 3.0110673904418945 + }, + { + "auxiliary_loss_clip": 0.01509549, + "auxiliary_loss_mlp": 0.01331088, + "balance_loss_clip": 1.15097857, + "balance_loss_mlp": 1.03068054, + "epoch": 0.2963987254253592, + "flos": 16364647128960.0, + "grad_norm": 3.0890242832270647, + "language_loss": 0.70453602, + "learning_rate": 3.3007609728863024e-06, + "loss": 0.73294234, + "num_input_tokens_seen": 52818425, + "router_z_loss_clip": 3.5859375, + "router_z_loss_mlp": 3.0, + "step": 2465, + "time_per_iteration": 3.8872628211975098 + }, + { + "auxiliary_loss_clip": 0.01519322, + "auxiliary_loss_mlp": 0.01337815, + "balance_loss_clip": 1.16032529, + "balance_loss_mlp": 1.0387429, + "epoch": 0.29651896831599833, + "flos": 33474969534240.0, + "grad_norm": 2.0568957946175925, + "language_loss": 0.7309854, + "learning_rate": 3.300169160677809e-06, + "loss": 0.75955677, + "num_input_tokens_seen": 52842340, + "router_z_loss_clip": 3.59570312, + "router_z_loss_mlp": 2.98632812, + "step": 2466, + "time_per_iteration": 3.9351799488067627 + }, + { + "auxiliary_loss_clip": 0.01523124, + "auxiliary_loss_mlp": 0.01345949, + "balance_loss_clip": 1.16530538, + "balance_loss_mlp": 1.04287124, + "epoch": 0.2966392112066374, + "flos": 23807629784160.0, + "grad_norm": 2.5558267317506345, + "language_loss": 0.77710688, + "learning_rate": 3.2995771512346878e-06, + "loss": 0.80579758, + "num_input_tokens_seen": 52860690, + "router_z_loss_clip": 3.58007812, + "router_z_loss_mlp": 3.02734375, + "step": 2467, + "time_per_iteration": 3.148144006729126 + }, + { + "auxiliary_loss_clip": 0.01523832, + "auxiliary_loss_mlp": 0.01341, + "balance_loss_clip": 1.16533911, + "balance_loss_mlp": 1.03944838, + "epoch": 0.2967594540972765, + "flos": 19940549757120.0, + "grad_norm": 2.372644609863584, + "language_loss": 0.73273969, + "learning_rate": 3.298984944646746e-06, + "loss": 0.761388, + "num_input_tokens_seen": 52879370, + "router_z_loss_clip": 3.5859375, + "router_z_loss_mlp": 3.01171875, + "step": 2468, + "time_per_iteration": 3.040360689163208 + }, + { + "auxiliary_loss_clip": 0.01522916, + "auxiliary_loss_mlp": 0.0133605, + "balance_loss_clip": 1.1651907, + "balance_loss_mlp": 1.03602409, + "epoch": 0.2968796969879156, + "flos": 23735110413600.0, + "grad_norm": 1.9773850214607132, + "language_loss": 0.81853926, + "learning_rate": 3.298392541003822e-06, + "loss": 0.84712893, + "num_input_tokens_seen": 52898775, + "router_z_loss_clip": 3.58007812, + "router_z_loss_mlp": 2.99414062, + "step": 2469, + "time_per_iteration": 3.091559886932373 + }, + { + "auxiliary_loss_clip": 0.01517854, + "auxiliary_loss_mlp": 0.01327578, + "balance_loss_clip": 1.15962791, + "balance_loss_mlp": 1.03060389, + "epoch": 0.29699993987855466, + "flos": 22895941494240.0, + "grad_norm": 1.6333106852833985, + "language_loss": 0.89860582, + "learning_rate": 3.2977999403957806e-06, + "loss": 0.92706019, + "num_input_tokens_seen": 52917535, + "router_z_loss_clip": 3.58398438, + "router_z_loss_mlp": 2.96679688, + "step": 2470, + "time_per_iteration": 3.0135302543640137 + }, + { + "auxiliary_loss_clip": 0.01525965, + "auxiliary_loss_mlp": 0.01325316, + "balance_loss_clip": 1.16622329, + "balance_loss_mlp": 1.03101158, + "epoch": 0.2971201827691938, + "flos": 33835025200320.0, + "grad_norm": 4.205654379792684, + "language_loss": 0.67493469, + "learning_rate": 3.2972071429125207e-06, + "loss": 0.70344746, + "num_input_tokens_seen": 52938755, + "router_z_loss_clip": 3.59570312, + "router_z_loss_mlp": 2.94335938, + "step": 2471, + "time_per_iteration": 3.1660408973693848 + }, + { + "auxiliary_loss_clip": 0.01519104, + "auxiliary_loss_mlp": 0.01327151, + "balance_loss_clip": 1.15902603, + "balance_loss_mlp": 1.02922344, + "epoch": 0.2972404256598329, + "flos": 22056545005920.0, + "grad_norm": 2.4474385231936826, + "language_loss": 0.88715518, + "learning_rate": 3.2966141486439682e-06, + "loss": 0.9156177, + "num_input_tokens_seen": 52957945, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 2.97460938, + "step": 2472, + "time_per_iteration": 3.013960599899292 + }, + { + "auxiliary_loss_clip": 0.01521874, + "auxiliary_loss_mlp": 0.01331454, + "balance_loss_clip": 1.16175556, + "balance_loss_mlp": 1.03161848, + "epoch": 0.29736066855047194, + "flos": 31981343656320.0, + "grad_norm": 2.896032784160502, + "language_loss": 0.64637262, + "learning_rate": 3.29602095768008e-06, + "loss": 0.6749059, + "num_input_tokens_seen": 52978460, + "router_z_loss_clip": 3.6015625, + "router_z_loss_mlp": 2.99414062, + "step": 2473, + "time_per_iteration": 3.2104456424713135 + }, + { + "auxiliary_loss_clip": 0.01523996, + "auxiliary_loss_mlp": 0.0132902, + "balance_loss_clip": 1.16407442, + "balance_loss_mlp": 1.0360508, + "epoch": 0.29748091144111105, + "flos": 33513201483840.0, + "grad_norm": 2.295615981623216, + "language_loss": 0.63970244, + "learning_rate": 3.2954275701108437e-06, + "loss": 0.66823262, + "num_input_tokens_seen": 52999640, + "router_z_loss_clip": 3.60351562, + "router_z_loss_mlp": 2.9296875, + "step": 2474, + "time_per_iteration": 3.043020009994507 + }, + { + "auxiliary_loss_clip": 0.01521051, + "auxiliary_loss_mlp": 0.01327896, + "balance_loss_clip": 1.16124034, + "balance_loss_mlp": 1.02996862, + "epoch": 0.29760115433175016, + "flos": 41286693404160.0, + "grad_norm": 2.0078780435315986, + "language_loss": 0.68786204, + "learning_rate": 3.294833986026275e-06, + "loss": 0.71635151, + "num_input_tokens_seen": 53022880, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 2.97460938, + "step": 2475, + "time_per_iteration": 3.213585138320923 + }, + { + "auxiliary_loss_clip": 0.01523142, + "auxiliary_loss_mlp": 0.01339406, + "balance_loss_clip": 1.16324449, + "balance_loss_mlp": 1.04643703, + "epoch": 0.2977213972223892, + "flos": 24495350103360.0, + "grad_norm": 1.9077228791361172, + "language_loss": 0.85447907, + "learning_rate": 3.29424020551642e-06, + "loss": 0.88310456, + "num_input_tokens_seen": 53041515, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 2.9296875, + "step": 2476, + "time_per_iteration": 3.107834577560425 + }, + { + "auxiliary_loss_clip": 0.01524521, + "auxiliary_loss_mlp": 0.01328591, + "balance_loss_clip": 1.16484082, + "balance_loss_mlp": 1.0302819, + "epoch": 0.2978416401130283, + "flos": 21287088773280.0, + "grad_norm": 2.3575902133841975, + "language_loss": 0.72145855, + "learning_rate": 3.2936462286713546e-06, + "loss": 0.74998969, + "num_input_tokens_seen": 53059865, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 2.98046875, + "step": 2477, + "time_per_iteration": 3.1080663204193115 + }, + { + "auxiliary_loss_clip": 0.01520083, + "auxiliary_loss_mlp": 0.01324671, + "balance_loss_clip": 1.15829921, + "balance_loss_mlp": 1.0265522, + "epoch": 0.2979618830036674, + "flos": 25774338337920.0, + "grad_norm": 2.851398283888858, + "language_loss": 0.77208722, + "learning_rate": 3.2930520555811846e-06, + "loss": 0.80053473, + "num_input_tokens_seen": 53079490, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 2.97851562, + "step": 2478, + "time_per_iteration": 3.031033754348755 + }, + { + "auxiliary_loss_clip": 0.01519259, + "auxiliary_loss_mlp": 0.01333184, + "balance_loss_clip": 1.15839887, + "balance_loss_mlp": 1.03544652, + "epoch": 0.2980821258943065, + "flos": 23479244496000.0, + "grad_norm": 1.9938117452683268, + "language_loss": 0.8018167, + "learning_rate": 3.292457686336046e-06, + "loss": 0.8303411, + "num_input_tokens_seen": 53098810, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 2.97460938, + "step": 2479, + "time_per_iteration": 2.9449234008789062 + }, + { + "auxiliary_loss_clip": 0.01607012, + "auxiliary_loss_mlp": 0.01281998, + "balance_loss_clip": 1.24499059, + "balance_loss_mlp": 1.04167175, + "epoch": 0.2982023687849456, + "flos": 69759836562240.0, + "grad_norm": 0.8720477176474807, + "language_loss": 0.61208647, + "learning_rate": 3.291863121026105e-06, + "loss": 0.64097655, + "num_input_tokens_seen": 53162590, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.3984375, + "step": 2480, + "time_per_iteration": 3.568075180053711 + }, + { + "auxiliary_loss_clip": 0.0152027, + "auxiliary_loss_mlp": 0.01334094, + "balance_loss_clip": 1.16014552, + "balance_loss_mlp": 1.03483057, + "epoch": 0.29832261167558466, + "flos": 29828557728000.0, + "grad_norm": 2.7612521782020383, + "language_loss": 0.76656175, + "learning_rate": 3.2912683597415547e-06, + "loss": 0.7951054, + "num_input_tokens_seen": 53186675, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 2.98828125, + "step": 2481, + "time_per_iteration": 3.0835680961608887 + }, + { + "auxiliary_loss_clip": 0.01522988, + "auxiliary_loss_mlp": 0.01329226, + "balance_loss_clip": 1.16210663, + "balance_loss_mlp": 1.03187013, + "epoch": 0.29844285456622377, + "flos": 33912892441440.0, + "grad_norm": 4.091086790744296, + "language_loss": 0.78281701, + "learning_rate": 3.2906734025726213e-06, + "loss": 0.81133914, + "num_input_tokens_seen": 53205940, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 2.96875, + "step": 2482, + "time_per_iteration": 3.0279319286346436 + }, + { + "auxiliary_loss_clip": 0.01524299, + "auxiliary_loss_mlp": 0.01339327, + "balance_loss_clip": 1.16314757, + "balance_loss_mlp": 1.04025507, + "epoch": 0.2985630974568629, + "flos": 23879125094400.0, + "grad_norm": 4.021832265140007, + "language_loss": 0.87868553, + "learning_rate": 3.290078249609559e-06, + "loss": 0.90732169, + "num_input_tokens_seen": 53225360, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 2.984375, + "step": 2483, + "time_per_iteration": 3.0244832038879395 + }, + { + "auxiliary_loss_clip": 0.01524299, + "auxiliary_loss_mlp": 0.01329864, + "balance_loss_clip": 1.16452861, + "balance_loss_mlp": 1.03346217, + "epoch": 0.29868334034750194, + "flos": 21801285938880.0, + "grad_norm": 2.1260510548919394, + "language_loss": 0.88145018, + "learning_rate": 3.2894829009426514e-06, + "loss": 0.9099918, + "num_input_tokens_seen": 53243195, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 2.9609375, + "step": 2484, + "time_per_iteration": 2.9929792881011963 + }, + { + "auxiliary_loss_clip": 0.01528629, + "auxiliary_loss_mlp": 0.01325757, + "balance_loss_clip": 1.16888857, + "balance_loss_mlp": 1.02706647, + "epoch": 0.29880358323814105, + "flos": 25669048672800.0, + "grad_norm": 2.634636103826742, + "language_loss": 0.77845073, + "learning_rate": 3.288887356662213e-06, + "loss": 0.80699456, + "num_input_tokens_seen": 53264530, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 2.98242188, + "step": 2485, + "time_per_iteration": 3.0450029373168945 + }, + { + "auxiliary_loss_clip": 0.01604589, + "auxiliary_loss_mlp": 0.0123275, + "balance_loss_clip": 1.24286366, + "balance_loss_mlp": 0.99623871, + "epoch": 0.29892382612878016, + "flos": 71012123372160.0, + "grad_norm": 0.8954670947265898, + "language_loss": 0.59673667, + "learning_rate": 3.288291616858588e-06, + "loss": 0.62511015, + "num_input_tokens_seen": 53319920, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.359375, + "step": 2486, + "time_per_iteration": 4.239594221115112 + }, + { + "auxiliary_loss_clip": 0.01521066, + "auxiliary_loss_mlp": 0.01328418, + "balance_loss_clip": 1.16097426, + "balance_loss_mlp": 1.02896464, + "epoch": 0.2990440690194192, + "flos": 25483350579840.0, + "grad_norm": 2.05656140729003, + "language_loss": 0.76996136, + "learning_rate": 3.287695681622149e-06, + "loss": 0.79845613, + "num_input_tokens_seen": 53339270, + "router_z_loss_clip": 3.6015625, + "router_z_loss_mlp": 2.99023438, + "step": 2487, + "time_per_iteration": 3.2090742588043213 + }, + { + "auxiliary_loss_clip": 0.01514015, + "auxiliary_loss_mlp": 0.01335507, + "balance_loss_clip": 1.15255833, + "balance_loss_mlp": 1.04024959, + "epoch": 0.2991643119100583, + "flos": 23734617347520.0, + "grad_norm": 2.032634038991613, + "language_loss": 0.80904889, + "learning_rate": 3.2870995510432982e-06, + "loss": 0.83754408, + "num_input_tokens_seen": 53357750, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.95117188, + "step": 2488, + "time_per_iteration": 4.070345640182495 + }, + { + "auxiliary_loss_clip": 0.01517867, + "auxiliary_loss_mlp": 0.013201, + "balance_loss_clip": 1.15683758, + "balance_loss_mlp": 1.02732241, + "epoch": 0.29928455480069743, + "flos": 27420019666560.0, + "grad_norm": 2.0763355077719243, + "language_loss": 0.77103549, + "learning_rate": 3.2865032252124697e-06, + "loss": 0.79941517, + "num_input_tokens_seen": 53378265, + "router_z_loss_clip": 3.609375, + "router_z_loss_mlp": 2.92578125, + "step": 2489, + "time_per_iteration": 3.1097640991210938 + }, + { + "auxiliary_loss_clip": 0.01514894, + "auxiliary_loss_mlp": 0.01320837, + "balance_loss_clip": 1.15432358, + "balance_loss_mlp": 1.02596092, + "epoch": 0.2994047976913365, + "flos": 33695751539520.0, + "grad_norm": 1.8279748139986798, + "language_loss": 0.77662587, + "learning_rate": 3.2859067042201243e-06, + "loss": 0.80498314, + "num_input_tokens_seen": 53400305, + "router_z_loss_clip": 3.60742188, + "router_z_loss_mlp": 2.9453125, + "step": 2490, + "time_per_iteration": 3.2200207710266113 + }, + { + "auxiliary_loss_clip": 0.01514796, + "auxiliary_loss_mlp": 0.01322323, + "balance_loss_clip": 1.15460336, + "balance_loss_mlp": 1.02763748, + "epoch": 0.2995250405819756, + "flos": 16765855212960.0, + "grad_norm": 2.5916250478070157, + "language_loss": 0.77987111, + "learning_rate": 3.2853099881567544e-06, + "loss": 0.80824226, + "num_input_tokens_seen": 53418705, + "router_z_loss_clip": 3.60351562, + "router_z_loss_mlp": 2.94726562, + "step": 2491, + "time_per_iteration": 3.018193006515503 + }, + { + "auxiliary_loss_clip": 0.01518429, + "auxiliary_loss_mlp": 0.01329078, + "balance_loss_clip": 1.15720868, + "balance_loss_mlp": 1.03324819, + "epoch": 0.29964528347261465, + "flos": 22966071390720.0, + "grad_norm": 2.36090269683526, + "language_loss": 0.79343551, + "learning_rate": 3.284713077112881e-06, + "loss": 0.82191056, + "num_input_tokens_seen": 53438135, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 2.95703125, + "step": 2492, + "time_per_iteration": 3.1897435188293457 + }, + { + "auxiliary_loss_clip": 0.015302, + "auxiliary_loss_mlp": 0.01332668, + "balance_loss_clip": 1.1687119, + "balance_loss_mlp": 1.03493118, + "epoch": 0.29976552636325376, + "flos": 16939454222880.0, + "grad_norm": 8.321700742492173, + "language_loss": 0.86922973, + "learning_rate": 3.284115971179056e-06, + "loss": 0.89785844, + "num_input_tokens_seen": 53452165, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 2.97265625, + "step": 2493, + "time_per_iteration": 3.9841485023498535 + }, + { + "auxiliary_loss_clip": 0.01523179, + "auxiliary_loss_mlp": 0.01327637, + "balance_loss_clip": 1.1621176, + "balance_loss_mlp": 1.03257036, + "epoch": 0.2998857692538929, + "flos": 17058398025600.0, + "grad_norm": 1.9411686611021435, + "language_loss": 0.78819859, + "learning_rate": 3.283518670445859e-06, + "loss": 0.81670678, + "num_input_tokens_seen": 53470075, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 2.95117188, + "step": 2494, + "time_per_iteration": 3.772165298461914 + }, + { + "auxiliary_loss_clip": 0.01614156, + "auxiliary_loss_mlp": 0.01227219, + "balance_loss_clip": 1.25484705, + "balance_loss_mlp": 0.99299622, + "epoch": 0.30000601214453193, + "flos": 68838514519680.0, + "grad_norm": 0.6917076744280826, + "language_loss": 0.54247922, + "learning_rate": 3.2829211750038995e-06, + "loss": 0.57089293, + "num_input_tokens_seen": 53538705, + "router_z_loss_clip": 3.59375, + "router_z_loss_mlp": 2.3359375, + "step": 2495, + "time_per_iteration": 3.505272150039673 + }, + { + "auxiliary_loss_clip": 0.01521839, + "auxiliary_loss_mlp": 0.01336634, + "balance_loss_clip": 1.16161871, + "balance_loss_mlp": 1.03660822, + "epoch": 0.30012625503517104, + "flos": 17605213773120.0, + "grad_norm": 1.8243194926280546, + "language_loss": 0.89515233, + "learning_rate": 3.2823234849438183e-06, + "loss": 0.92373711, + "num_input_tokens_seen": 53556740, + "router_z_loss_clip": 3.6015625, + "router_z_loss_mlp": 2.99609375, + "step": 2496, + "time_per_iteration": 3.101142644882202 + }, + { + "auxiliary_loss_clip": 0.01524332, + "auxiliary_loss_mlp": 0.01319444, + "balance_loss_clip": 1.16425633, + "balance_loss_mlp": 1.02628481, + "epoch": 0.30024649792581015, + "flos": 21254773616640.0, + "grad_norm": 2.5522376211285374, + "language_loss": 0.7577951, + "learning_rate": 3.2817256003562836e-06, + "loss": 0.78623283, + "num_input_tokens_seen": 53577115, + "router_z_loss_clip": 3.6015625, + "router_z_loss_mlp": 2.93359375, + "step": 2497, + "time_per_iteration": 3.0185916423797607 + }, + { + "auxiliary_loss_clip": 0.015296, + "auxiliary_loss_mlp": 0.01349465, + "balance_loss_clip": 1.17138886, + "balance_loss_mlp": 1.05096531, + "epoch": 0.3003667408164492, + "flos": 23005517041440.0, + "grad_norm": 1.7462082446828433, + "language_loss": 0.65900159, + "learning_rate": 3.281127521331995e-06, + "loss": 0.68779224, + "num_input_tokens_seen": 53598295, + "router_z_loss_clip": 3.58203125, + "router_z_loss_mlp": 2.98242188, + "step": 2498, + "time_per_iteration": 3.081672191619873 + }, + { + "auxiliary_loss_clip": 0.01620454, + "auxiliary_loss_mlp": 0.01223961, + "balance_loss_clip": 1.26233208, + "balance_loss_mlp": 0.98592377, + "epoch": 0.3004869837070883, + "flos": 64238958580320.0, + "grad_norm": 0.9256635164278125, + "language_loss": 0.60726464, + "learning_rate": 3.2805292479616798e-06, + "loss": 0.63570869, + "num_input_tokens_seen": 53657160, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 2.375, + "step": 2499, + "time_per_iteration": 3.353055238723755 + }, + { + "auxiliary_loss_clip": 0.0153065, + "auxiliary_loss_mlp": 0.01343241, + "balance_loss_clip": 1.17137277, + "balance_loss_mlp": 1.04302454, + "epoch": 0.30060722659772743, + "flos": 26250986260800.0, + "grad_norm": 2.388807620195976, + "language_loss": 0.91995835, + "learning_rate": 3.2799307803360955e-06, + "loss": 0.94869733, + "num_input_tokens_seen": 53673090, + "router_z_loss_clip": 3.59375, + "router_z_loss_mlp": 2.99804688, + "step": 2500, + "time_per_iteration": 2.9560582637786865 + }, + { + "auxiliary_loss_clip": 0.01517767, + "auxiliary_loss_mlp": 0.01319169, + "balance_loss_clip": 1.15655005, + "balance_loss_mlp": 1.02600932, + "epoch": 0.3007274694883665, + "flos": 24973097942880.0, + "grad_norm": 1.6183835562606945, + "language_loss": 0.81692851, + "learning_rate": 3.27933211854603e-06, + "loss": 0.84529787, + "num_input_tokens_seen": 53692145, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 2.93359375, + "step": 2501, + "time_per_iteration": 3.1376466751098633 + }, + { + "auxiliary_loss_clip": 0.01519774, + "auxiliary_loss_mlp": 0.0132839, + "balance_loss_clip": 1.15967846, + "balance_loss_mlp": 1.02969933, + "epoch": 0.3008477123790056, + "flos": 17057753246880.0, + "grad_norm": 2.0387229949168213, + "language_loss": 0.87151492, + "learning_rate": 3.278733262682299e-06, + "loss": 0.89999658, + "num_input_tokens_seen": 53710000, + "router_z_loss_clip": 3.60351562, + "router_z_loss_mlp": 2.98242188, + "step": 2502, + "time_per_iteration": 2.947866678237915 + }, + { + "auxiliary_loss_clip": 0.01517295, + "auxiliary_loss_mlp": 0.01328596, + "balance_loss_clip": 1.15793586, + "balance_loss_mlp": 1.0314312, + "epoch": 0.3009679552696447, + "flos": 21508515557280.0, + "grad_norm": 2.331141935270598, + "language_loss": 0.8283006, + "learning_rate": 3.2781342128357484e-06, + "loss": 0.85675955, + "num_input_tokens_seen": 53729355, + "router_z_loss_clip": 3.59375, + "router_z_loss_mlp": 2.97070312, + "step": 2503, + "time_per_iteration": 2.9929122924804688 + }, + { + "auxiliary_loss_clip": 0.01527157, + "auxiliary_loss_mlp": 0.01341823, + "balance_loss_clip": 1.16866767, + "balance_loss_mlp": 1.04294145, + "epoch": 0.30108819816028376, + "flos": 21135678101280.0, + "grad_norm": 3.6827011074091516, + "language_loss": 0.80556679, + "learning_rate": 3.2775349690972547e-06, + "loss": 0.83425665, + "num_input_tokens_seen": 53743505, + "router_z_loss_clip": 3.5859375, + "router_z_loss_mlp": 2.984375, + "step": 2504, + "time_per_iteration": 3.07802414894104 + }, + { + "auxiliary_loss_clip": 0.01629128, + "auxiliary_loss_mlp": 0.01287628, + "balance_loss_clip": 1.27352738, + "balance_loss_mlp": 1.04806519, + "epoch": 0.30120844105092287, + "flos": 71133342864480.0, + "grad_norm": 0.851028278817965, + "language_loss": 0.51728654, + "learning_rate": 3.276935531557722e-06, + "loss": 0.54645407, + "num_input_tokens_seen": 53808725, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.390625, + "step": 2505, + "time_per_iteration": 3.617236852645874 + }, + { + "auxiliary_loss_clip": 0.01529209, + "auxiliary_loss_mlp": 0.01334551, + "balance_loss_clip": 1.17025113, + "balance_loss_mlp": 1.03700507, + "epoch": 0.301328683941562, + "flos": 20266545571200.0, + "grad_norm": 2.8057670484903783, + "language_loss": 0.79673505, + "learning_rate": 3.2763359003080837e-06, + "loss": 0.8253727, + "num_input_tokens_seen": 53825680, + "router_z_loss_clip": 3.58789062, + "router_z_loss_mlp": 2.97265625, + "step": 2506, + "time_per_iteration": 3.0373470783233643 + }, + { + "auxiliary_loss_clip": 0.01627786, + "auxiliary_loss_mlp": 0.01254021, + "balance_loss_clip": 1.27147985, + "balance_loss_mlp": 1.01522064, + "epoch": 0.30144892683220104, + "flos": 70654874389920.0, + "grad_norm": 0.8372594048030472, + "language_loss": 0.62426037, + "learning_rate": 3.2757360754393047e-06, + "loss": 0.65307844, + "num_input_tokens_seen": 53889750, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.3828125, + "step": 2507, + "time_per_iteration": 3.5672361850738525 + }, + { + "auxiliary_loss_clip": 0.01523457, + "auxiliary_loss_mlp": 0.01341941, + "balance_loss_clip": 1.16327178, + "balance_loss_mlp": 1.04458511, + "epoch": 0.30156916972284015, + "flos": 22822777344960.0, + "grad_norm": 4.070918171105745, + "language_loss": 0.64069641, + "learning_rate": 3.2751360570423767e-06, + "loss": 0.66935039, + "num_input_tokens_seen": 53908135, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 2.97070312, + "step": 2508, + "time_per_iteration": 3.058184862136841 + }, + { + "auxiliary_loss_clip": 0.01521255, + "auxiliary_loss_mlp": 0.01327198, + "balance_loss_clip": 1.16088617, + "balance_loss_mlp": 1.02812541, + "epoch": 0.3016894126134792, + "flos": 29901835661760.0, + "grad_norm": 2.544818682363127, + "language_loss": 0.76151478, + "learning_rate": 3.2745358452083236e-06, + "loss": 0.78999937, + "num_input_tokens_seen": 53931035, + "router_z_loss_clip": 3.60351562, + "router_z_loss_mlp": 2.98828125, + "step": 2509, + "time_per_iteration": 3.170471429824829 + }, + { + "auxiliary_loss_clip": 0.01522109, + "auxiliary_loss_mlp": 0.01325871, + "balance_loss_clip": 1.16337729, + "balance_loss_mlp": 1.03137672, + "epoch": 0.3018096555041183, + "flos": 21548643914880.0, + "grad_norm": 1.4423928184238561, + "language_loss": 0.82473856, + "learning_rate": 3.2739354400281955e-06, + "loss": 0.85321838, + "num_input_tokens_seen": 53952255, + "router_z_loss_clip": 3.58984375, + "router_z_loss_mlp": 2.94140625, + "step": 2510, + "time_per_iteration": 3.2362611293792725 + }, + { + "auxiliary_loss_clip": 0.01616516, + "auxiliary_loss_mlp": 0.01214409, + "balance_loss_clip": 1.25948858, + "balance_loss_mlp": 0.98323822, + "epoch": 0.3019298983947574, + "flos": 59143297207680.0, + "grad_norm": 0.8787908723322605, + "language_loss": 0.63587391, + "learning_rate": 3.2733348415930744e-06, + "loss": 0.6641832, + "num_input_tokens_seen": 54014125, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.3046875, + "step": 2511, + "time_per_iteration": 3.45133113861084 + }, + { + "auxiliary_loss_clip": 0.01514232, + "auxiliary_loss_mlp": 0.0132214, + "balance_loss_clip": 1.15462923, + "balance_loss_mlp": 1.02611971, + "epoch": 0.3020501412853965, + "flos": 34425951762240.0, + "grad_norm": 2.1764400615481208, + "language_loss": 0.80953515, + "learning_rate": 3.27273404999407e-06, + "loss": 0.83789885, + "num_input_tokens_seen": 54036345, + "router_z_loss_clip": 3.59570312, + "router_z_loss_mlp": 2.95898438, + "step": 2512, + "time_per_iteration": 3.1866371631622314 + }, + { + "auxiliary_loss_clip": 0.01605624, + "auxiliary_loss_mlp": 0.0122142, + "balance_loss_clip": 1.24867392, + "balance_loss_mlp": 0.99024963, + "epoch": 0.3021703841760356, + "flos": 71014702487040.0, + "grad_norm": 0.8097566077525568, + "language_loss": 0.60395885, + "learning_rate": 3.272133065322322e-06, + "loss": 0.63222933, + "num_input_tokens_seen": 54094615, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.3046875, + "step": 2513, + "time_per_iteration": 3.4231107234954834 + }, + { + "auxiliary_loss_clip": 0.01514519, + "auxiliary_loss_mlp": 0.01326508, + "balance_loss_clip": 1.15505445, + "balance_loss_mlp": 1.03334892, + "epoch": 0.3022906270666747, + "flos": 21512877295680.0, + "grad_norm": 1.9259114301510207, + "language_loss": 0.79743826, + "learning_rate": 3.271531887669e-06, + "loss": 0.82584858, + "num_input_tokens_seen": 54114675, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 2.93359375, + "step": 2514, + "time_per_iteration": 3.854309320449829 + }, + { + "auxiliary_loss_clip": 0.0151997, + "auxiliary_loss_mlp": 0.01315948, + "balance_loss_clip": 1.16277146, + "balance_loss_mlp": 1.02240717, + "epoch": 0.30241086995731375, + "flos": 31134285679680.0, + "grad_norm": 4.042978507651114, + "language_loss": 0.63286984, + "learning_rate": 3.2709305171253015e-06, + "loss": 0.66122901, + "num_input_tokens_seen": 54134795, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.93554688, + "step": 2515, + "time_per_iteration": 3.921666383743286 + }, + { + "auxiliary_loss_clip": 0.01523473, + "auxiliary_loss_mlp": 0.01333677, + "balance_loss_clip": 1.16409886, + "balance_loss_mlp": 1.04051745, + "epoch": 0.30253111284795287, + "flos": 23513683629600.0, + "grad_norm": 2.781074883687026, + "language_loss": 0.78289837, + "learning_rate": 3.2703289537824536e-06, + "loss": 0.81146991, + "num_input_tokens_seen": 54154595, + "router_z_loss_clip": 3.59570312, + "router_z_loss_mlp": 2.93164062, + "step": 2516, + "time_per_iteration": 3.0467028617858887 + }, + { + "auxiliary_loss_clip": 0.01523746, + "auxiliary_loss_mlp": 0.01338796, + "balance_loss_clip": 1.1663847, + "balance_loss_mlp": 1.04124951, + "epoch": 0.302651355738592, + "flos": 18726874542720.0, + "grad_norm": 2.6420344823018715, + "language_loss": 0.79467213, + "learning_rate": 3.269727197731714e-06, + "loss": 0.8232975, + "num_input_tokens_seen": 54167360, + "router_z_loss_clip": 3.58007812, + "router_z_loss_mlp": 2.97070312, + "step": 2517, + "time_per_iteration": 2.97064471244812 + }, + { + "auxiliary_loss_clip": 0.01517992, + "auxiliary_loss_mlp": 0.0133591, + "balance_loss_clip": 1.15912962, + "balance_loss_mlp": 1.03912663, + "epoch": 0.30277159862923103, + "flos": 22421076194880.0, + "grad_norm": 1.6639051277484842, + "language_loss": 0.78182954, + "learning_rate": 3.269125249064367e-06, + "loss": 0.81036854, + "num_input_tokens_seen": 54187055, + "router_z_loss_clip": 3.58984375, + "router_z_loss_mlp": 2.96484375, + "step": 2518, + "time_per_iteration": 2.956094980239868 + }, + { + "auxiliary_loss_clip": 0.01518616, + "auxiliary_loss_mlp": 0.01327784, + "balance_loss_clip": 1.15996575, + "balance_loss_mlp": 1.03214538, + "epoch": 0.30289184151987014, + "flos": 22275316818720.0, + "grad_norm": 1.7155373594414467, + "language_loss": 0.83169055, + "learning_rate": 3.2685231078717297e-06, + "loss": 0.86015451, + "num_input_tokens_seen": 54207245, + "router_z_loss_clip": 3.59179688, + "router_z_loss_mlp": 2.95507812, + "step": 2519, + "time_per_iteration": 3.0741305351257324 + }, + { + "auxiliary_loss_clip": 0.01524921, + "auxiliary_loss_mlp": 0.01318241, + "balance_loss_clip": 1.16575134, + "balance_loss_mlp": 1.02336502, + "epoch": 0.30301208441050925, + "flos": 25227788087520.0, + "grad_norm": 5.10595149869452, + "language_loss": 0.75165063, + "learning_rate": 3.267920774245145e-06, + "loss": 0.78008229, + "num_input_tokens_seen": 54226650, + "router_z_loss_clip": 3.59179688, + "router_z_loss_mlp": 2.9453125, + "step": 2520, + "time_per_iteration": 3.869702100753784 + }, + { + "auxiliary_loss_clip": 0.01523474, + "auxiliary_loss_mlp": 0.0134174, + "balance_loss_clip": 1.16505313, + "balance_loss_mlp": 1.04457533, + "epoch": 0.3031323273011483, + "flos": 23041473301440.0, + "grad_norm": 1.849870348880317, + "language_loss": 0.84572923, + "learning_rate": 3.2673182482759876e-06, + "loss": 0.87438142, + "num_input_tokens_seen": 54245765, + "router_z_loss_clip": 3.58789062, + "router_z_loss_mlp": 2.97265625, + "step": 2521, + "time_per_iteration": 3.7972543239593506 + }, + { + "auxiliary_loss_clip": 0.0152064, + "auxiliary_loss_mlp": 0.01320334, + "balance_loss_clip": 1.16094756, + "balance_loss_mlp": 1.02583933, + "epoch": 0.3032525701917874, + "flos": 18878512783680.0, + "grad_norm": 1.748555686361795, + "language_loss": 0.6622324, + "learning_rate": 3.266715530055659e-06, + "loss": 0.69064218, + "num_input_tokens_seen": 54263915, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 2.9453125, + "step": 2522, + "time_per_iteration": 3.045269012451172 + }, + { + "auxiliary_loss_clip": 0.01518683, + "auxiliary_loss_mlp": 0.01330465, + "balance_loss_clip": 1.15916228, + "balance_loss_mlp": 1.03539813, + "epoch": 0.30337281308242653, + "flos": 17784426150720.0, + "grad_norm": 4.67786010199993, + "language_loss": 0.8096019, + "learning_rate": 3.2661126196755927e-06, + "loss": 0.8380934, + "num_input_tokens_seen": 54283025, + "router_z_loss_clip": 3.59179688, + "router_z_loss_mlp": 2.94921875, + "step": 2523, + "time_per_iteration": 3.00870418548584 + }, + { + "auxiliary_loss_clip": 0.01607553, + "auxiliary_loss_mlp": 0.01240265, + "balance_loss_clip": 1.24999571, + "balance_loss_mlp": 1.00604248, + "epoch": 0.3034930559730656, + "flos": 57831311109600.0, + "grad_norm": 0.7829524746561206, + "language_loss": 0.5588116, + "learning_rate": 3.265509517227248e-06, + "loss": 0.58728981, + "num_input_tokens_seen": 54339840, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 2.3359375, + "step": 2524, + "time_per_iteration": 3.590834856033325 + }, + { + "auxiliary_loss_clip": 0.01519473, + "auxiliary_loss_mlp": 0.01329389, + "balance_loss_clip": 1.16042686, + "balance_loss_mlp": 1.03394067, + "epoch": 0.3036132988637047, + "flos": 14757273606240.0, + "grad_norm": 1.9638489571493445, + "language_loss": 0.8060655, + "learning_rate": 3.264906222802115e-06, + "loss": 0.83455414, + "num_input_tokens_seen": 54357690, + "router_z_loss_clip": 3.58984375, + "router_z_loss_mlp": 2.953125, + "step": 2525, + "time_per_iteration": 3.099274158477783 + }, + { + "auxiliary_loss_clip": 0.01515035, + "auxiliary_loss_mlp": 0.01314326, + "balance_loss_clip": 1.15429401, + "balance_loss_mlp": 1.02059448, + "epoch": 0.30373354175434375, + "flos": 21035357025120.0, + "grad_norm": 2.4718443730183988, + "language_loss": 0.78606725, + "learning_rate": 3.264302736491715e-06, + "loss": 0.81436086, + "num_input_tokens_seen": 54377810, + "router_z_loss_clip": 3.60742188, + "router_z_loss_mlp": 2.93945312, + "step": 2526, + "time_per_iteration": 3.041351318359375 + }, + { + "auxiliary_loss_clip": 0.01522946, + "auxiliary_loss_mlp": 0.01320846, + "balance_loss_clip": 1.1638279, + "balance_loss_mlp": 1.02844965, + "epoch": 0.30385378464498286, + "flos": 21145425638400.0, + "grad_norm": 2.482904318012668, + "language_loss": 0.87485552, + "learning_rate": 3.263699058387594e-06, + "loss": 0.90329349, + "num_input_tokens_seen": 54395245, + "router_z_loss_clip": 3.58789062, + "router_z_loss_mlp": 2.92578125, + "step": 2527, + "time_per_iteration": 3.1207966804504395 + }, + { + "auxiliary_loss_clip": 0.01517316, + "auxiliary_loss_mlp": 0.01328714, + "balance_loss_clip": 1.15745735, + "balance_loss_mlp": 1.03250277, + "epoch": 0.30397402753562197, + "flos": 20631228472800.0, + "grad_norm": 2.480707798109495, + "language_loss": 0.90254796, + "learning_rate": 3.2630951885813315e-06, + "loss": 0.93100822, + "num_input_tokens_seen": 54412640, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 2.95703125, + "step": 2528, + "time_per_iteration": 3.053849935531616 + }, + { + "auxiliary_loss_clip": 0.01520222, + "auxiliary_loss_mlp": 0.01328335, + "balance_loss_clip": 1.15998387, + "balance_loss_mlp": 1.03250563, + "epoch": 0.304094270426261, + "flos": 15087479446080.0, + "grad_norm": 2.447639771349057, + "language_loss": 0.78007984, + "learning_rate": 3.262491127164533e-06, + "loss": 0.8085655, + "num_input_tokens_seen": 54431455, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 2.95703125, + "step": 2529, + "time_per_iteration": 3.0267982482910156 + }, + { + "auxiliary_loss_clip": 0.01519052, + "auxiliary_loss_mlp": 0.01329831, + "balance_loss_clip": 1.15829349, + "balance_loss_mlp": 1.03438234, + "epoch": 0.30421451331690014, + "flos": 13846950730080.0, + "grad_norm": 2.862515739503949, + "language_loss": 0.80063069, + "learning_rate": 3.2618868742288337e-06, + "loss": 0.82911944, + "num_input_tokens_seen": 54448380, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 2.953125, + "step": 2530, + "time_per_iteration": 3.1210997104644775 + }, + { + "auxiliary_loss_clip": 0.015176, + "auxiliary_loss_mlp": 0.01327203, + "balance_loss_clip": 1.15835261, + "balance_loss_mlp": 1.03061068, + "epoch": 0.30433475620753925, + "flos": 17386138535040.0, + "grad_norm": 1.860848655900789, + "language_loss": 0.72604203, + "learning_rate": 3.261282429865899e-06, + "loss": 0.75449002, + "num_input_tokens_seen": 54466385, + "router_z_loss_clip": 3.59570312, + "router_z_loss_mlp": 2.96484375, + "step": 2531, + "time_per_iteration": 3.0828380584716797 + }, + { + "auxiliary_loss_clip": 0.01523678, + "auxiliary_loss_mlp": 0.01313887, + "balance_loss_clip": 1.16258848, + "balance_loss_mlp": 1.02339745, + "epoch": 0.3044549990981783, + "flos": 18918982494720.0, + "grad_norm": 2.185465984243266, + "language_loss": 0.72209918, + "learning_rate": 3.2606777941674225e-06, + "loss": 0.75047481, + "num_input_tokens_seen": 54485040, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 2.90625, + "step": 2532, + "time_per_iteration": 3.0714924335479736 + }, + { + "auxiliary_loss_clip": 0.0151479, + "auxiliary_loss_mlp": 0.01321186, + "balance_loss_clip": 1.15427446, + "balance_loss_mlp": 1.02688253, + "epoch": 0.3045752419888174, + "flos": 21070516793760.0, + "grad_norm": 2.087834732660366, + "language_loss": 0.84405315, + "learning_rate": 3.2600729672251276e-06, + "loss": 0.87241292, + "num_input_tokens_seen": 54502755, + "router_z_loss_clip": 3.6015625, + "router_z_loss_mlp": 2.94140625, + "step": 2533, + "time_per_iteration": 2.971780300140381 + }, + { + "auxiliary_loss_clip": 0.01519808, + "auxiliary_loss_mlp": 0.01329849, + "balance_loss_clip": 1.15845752, + "balance_loss_mlp": 1.03573537, + "epoch": 0.3046954848794565, + "flos": 29099002284000.0, + "grad_norm": 1.9660988736033915, + "language_loss": 0.65535438, + "learning_rate": 3.259467949130765e-06, + "loss": 0.68385094, + "num_input_tokens_seen": 54524165, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 2.93945312, + "step": 2534, + "time_per_iteration": 3.0753815174102783 + }, + { + "auxiliary_loss_clip": 0.01518334, + "auxiliary_loss_mlp": 0.01318023, + "balance_loss_clip": 1.15806043, + "balance_loss_mlp": 1.0233376, + "epoch": 0.3048157277700956, + "flos": 20297002248000.0, + "grad_norm": 2.6203260748538795, + "language_loss": 0.83271706, + "learning_rate": 3.2588627399761164e-06, + "loss": 0.86108065, + "num_input_tokens_seen": 54540160, + "router_z_loss_clip": 3.60351562, + "router_z_loss_mlp": 2.9453125, + "step": 2535, + "time_per_iteration": 2.9857888221740723 + }, + { + "auxiliary_loss_clip": 0.01525359, + "auxiliary_loss_mlp": 0.01322128, + "balance_loss_clip": 1.16375756, + "balance_loss_mlp": 1.02706075, + "epoch": 0.3049359706607347, + "flos": 22741724138400.0, + "grad_norm": 3.434106017006557, + "language_loss": 0.71033311, + "learning_rate": 3.2582573398529903e-06, + "loss": 0.73880798, + "num_input_tokens_seen": 54557515, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.94921875, + "step": 2536, + "time_per_iteration": 2.977215528488159 + }, + { + "auxiliary_loss_clip": 0.01522492, + "auxiliary_loss_mlp": 0.0132555, + "balance_loss_clip": 1.16088867, + "balance_loss_mlp": 1.02895701, + "epoch": 0.3050562135513738, + "flos": 18436190209920.0, + "grad_norm": 5.755190323887027, + "language_loss": 0.74376655, + "learning_rate": 3.2576517488532265e-06, + "loss": 0.77224696, + "num_input_tokens_seen": 54573865, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.9609375, + "step": 2537, + "time_per_iteration": 2.9729115962982178 + }, + { + "auxiliary_loss_clip": 0.0151973, + "auxiliary_loss_mlp": 0.0131638, + "balance_loss_clip": 1.15754414, + "balance_loss_mlp": 1.02379298, + "epoch": 0.30517645644201286, + "flos": 20372100733440.0, + "grad_norm": 1.7219001478496052, + "language_loss": 0.87158906, + "learning_rate": 3.257045967068692e-06, + "loss": 0.89995021, + "num_input_tokens_seen": 54593120, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 2.92773438, + "step": 2538, + "time_per_iteration": 2.980158567428589 + }, + { + "auxiliary_loss_clip": 0.01521621, + "auxiliary_loss_mlp": 0.01334156, + "balance_loss_clip": 1.1608423, + "balance_loss_mlp": 1.04004335, + "epoch": 0.30529669933265197, + "flos": 21947310812160.0, + "grad_norm": 1.7033553732513105, + "language_loss": 0.81902611, + "learning_rate": 3.2564399945912848e-06, + "loss": 0.84758395, + "num_input_tokens_seen": 54612910, + "router_z_loss_clip": 3.609375, + "router_z_loss_mlp": 2.9375, + "step": 2539, + "time_per_iteration": 2.920025110244751 + }, + { + "auxiliary_loss_clip": 0.01520697, + "auxiliary_loss_mlp": 0.01323042, + "balance_loss_clip": 1.15938866, + "balance_loss_mlp": 1.03045487, + "epoch": 0.305416942223291, + "flos": 21837583552320.0, + "grad_norm": 2.5760262625049677, + "language_loss": 0.82118481, + "learning_rate": 3.2558338315129287e-06, + "loss": 0.84962219, + "num_input_tokens_seen": 54631055, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 2.92773438, + "step": 2540, + "time_per_iteration": 3.0111494064331055 + }, + { + "auxiliary_loss_clip": 0.01518876, + "auxiliary_loss_mlp": 0.01324154, + "balance_loss_clip": 1.15672553, + "balance_loss_mlp": 1.02832413, + "epoch": 0.30553718511393013, + "flos": 33914637136800.0, + "grad_norm": 2.1096909550659873, + "language_loss": 0.75820887, + "learning_rate": 3.2552274779255785e-06, + "loss": 0.78663921, + "num_input_tokens_seen": 54651985, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.95507812, + "step": 2541, + "time_per_iteration": 4.006465196609497 + }, + { + "auxiliary_loss_clip": 0.0151511, + "auxiliary_loss_mlp": 0.01326559, + "balance_loss_clip": 1.15281069, + "balance_loss_mlp": 1.03015733, + "epoch": 0.30565742800456924, + "flos": 22270462014240.0, + "grad_norm": 2.0751185908074388, + "language_loss": 0.76922351, + "learning_rate": 3.2546209339212184e-06, + "loss": 0.7976402, + "num_input_tokens_seen": 54671005, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.96289062, + "step": 2542, + "time_per_iteration": 4.0162153244018555 + }, + { + "auxiliary_loss_clip": 0.01520433, + "auxiliary_loss_mlp": 0.01321161, + "balance_loss_clip": 1.15870214, + "balance_loss_mlp": 1.02952695, + "epoch": 0.3057776708952083, + "flos": 22567252780800.0, + "grad_norm": 1.6574732864754207, + "language_loss": 0.77731991, + "learning_rate": 3.25401419959186e-06, + "loss": 0.80573583, + "num_input_tokens_seen": 54691615, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.91992188, + "step": 2543, + "time_per_iteration": 3.11307954788208 + }, + { + "auxiliary_loss_clip": 0.01526275, + "auxiliary_loss_mlp": 0.01335931, + "balance_loss_clip": 1.16531301, + "balance_loss_mlp": 1.03952885, + "epoch": 0.3058979137858474, + "flos": 21801399723360.0, + "grad_norm": 2.3110884302285926, + "language_loss": 0.76163882, + "learning_rate": 3.253407275029545e-06, + "loss": 0.79026091, + "num_input_tokens_seen": 54710520, + "router_z_loss_clip": 3.609375, + "router_z_loss_mlp": 2.9609375, + "step": 2544, + "time_per_iteration": 3.116424083709717 + }, + { + "auxiliary_loss_clip": 0.01525845, + "auxiliary_loss_mlp": 0.01320367, + "balance_loss_clip": 1.164505, + "balance_loss_mlp": 1.02530003, + "epoch": 0.3060181566764865, + "flos": 26981869190400.0, + "grad_norm": 2.5457164422753142, + "language_loss": 0.8008185, + "learning_rate": 3.2528001603263425e-06, + "loss": 0.82928061, + "num_input_tokens_seen": 54732590, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 2.94726562, + "step": 2545, + "time_per_iteration": 3.056065320968628 + }, + { + "auxiliary_loss_clip": 0.01525032, + "auxiliary_loss_mlp": 0.01334732, + "balance_loss_clip": 1.16346169, + "balance_loss_mlp": 1.0419544, + "epoch": 0.3061383995671256, + "flos": 19867158038880.0, + "grad_norm": 2.3539620498182097, + "language_loss": 0.81555128, + "learning_rate": 3.2521928555743514e-06, + "loss": 0.84414899, + "num_input_tokens_seen": 54749935, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 2.92578125, + "step": 2546, + "time_per_iteration": 3.086524486541748 + }, + { + "auxiliary_loss_clip": 0.01522388, + "auxiliary_loss_mlp": 0.01329171, + "balance_loss_clip": 1.16066241, + "balance_loss_mlp": 1.0337224, + "epoch": 0.3062586424577647, + "flos": 22129709155200.0, + "grad_norm": 1.761831455051869, + "language_loss": 0.67350179, + "learning_rate": 3.2515853608657e-06, + "loss": 0.70201743, + "num_input_tokens_seen": 54767935, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 2.953125, + "step": 2547, + "time_per_iteration": 3.8990447521209717 + }, + { + "auxiliary_loss_clip": 0.01530517, + "auxiliary_loss_mlp": 0.01329742, + "balance_loss_clip": 1.16938674, + "balance_loss_mlp": 1.03505683, + "epoch": 0.3063788853484038, + "flos": 20847345314400.0, + "grad_norm": 3.136445589070278, + "language_loss": 0.75275367, + "learning_rate": 3.250977676292545e-06, + "loss": 0.78135622, + "num_input_tokens_seen": 54786175, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 2.94335938, + "step": 2548, + "time_per_iteration": 3.901728391647339 + }, + { + "auxiliary_loss_clip": 0.01523264, + "auxiliary_loss_mlp": 0.01321654, + "balance_loss_clip": 1.16142225, + "balance_loss_mlp": 1.02925742, + "epoch": 0.30649912823904285, + "flos": 16211188336320.0, + "grad_norm": 2.3106164505346527, + "language_loss": 0.79456323, + "learning_rate": 3.2503698019470712e-06, + "loss": 0.82301247, + "num_input_tokens_seen": 54801945, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.92382812, + "step": 2549, + "time_per_iteration": 3.0070536136627197 + }, + { + "auxiliary_loss_clip": 0.01532804, + "auxiliary_loss_mlp": 0.01318621, + "balance_loss_clip": 1.16991889, + "balance_loss_mlp": 1.02469897, + "epoch": 0.30661937112968196, + "flos": 18619422972480.0, + "grad_norm": 2.448596425552678, + "language_loss": 0.78537142, + "learning_rate": 3.249761737921492e-06, + "loss": 0.81388569, + "num_input_tokens_seen": 54818475, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.93945312, + "step": 2550, + "time_per_iteration": 3.0292961597442627 + }, + { + "auxiliary_loss_clip": 0.0152062, + "auxiliary_loss_mlp": 0.01311081, + "balance_loss_clip": 1.15972865, + "balance_loss_mlp": 1.02192652, + "epoch": 0.30673961402032107, + "flos": 31393299634560.0, + "grad_norm": 2.1511630684126217, + "language_loss": 0.74431086, + "learning_rate": 3.249153484308051e-06, + "loss": 0.77262783, + "num_input_tokens_seen": 54837090, + "router_z_loss_clip": 3.60742188, + "router_z_loss_mlp": 2.89453125, + "step": 2551, + "time_per_iteration": 3.1258316040039062 + }, + { + "auxiliary_loss_clip": 0.01523536, + "auxiliary_loss_mlp": 0.01318311, + "balance_loss_clip": 1.16402948, + "balance_loss_mlp": 1.02305377, + "epoch": 0.3068598569109601, + "flos": 20231878868640.0, + "grad_norm": 2.1190333035132842, + "language_loss": 0.77509218, + "learning_rate": 3.2485450411990194e-06, + "loss": 0.80351067, + "num_input_tokens_seen": 54856445, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 2.94921875, + "step": 2552, + "time_per_iteration": 3.0207159519195557 + }, + { + "auxiliary_loss_clip": 0.01527499, + "auxiliary_loss_mlp": 0.01319574, + "balance_loss_clip": 1.16761589, + "balance_loss_mlp": 1.02431643, + "epoch": 0.30698009980159924, + "flos": 29604324260160.0, + "grad_norm": 2.782474292081114, + "language_loss": 0.82943541, + "learning_rate": 3.2479364086866983e-06, + "loss": 0.8579061, + "num_input_tokens_seen": 54876700, + "router_z_loss_clip": 3.6015625, + "router_z_loss_mlp": 2.95117188, + "step": 2553, + "time_per_iteration": 3.1285107135772705 + }, + { + "auxiliary_loss_clip": 0.01536045, + "auxiliary_loss_mlp": 0.0132771, + "balance_loss_clip": 1.17626238, + "balance_loss_mlp": 1.03226149, + "epoch": 0.30710034269223835, + "flos": 23844723888960.0, + "grad_norm": 1.9316509315434842, + "language_loss": 0.81373113, + "learning_rate": 3.247327586863416e-06, + "loss": 0.84236872, + "num_input_tokens_seen": 54897580, + "router_z_loss_clip": 3.59375, + "router_z_loss_mlp": 2.953125, + "step": 2554, + "time_per_iteration": 3.0396409034729004 + }, + { + "auxiliary_loss_clip": 0.01521471, + "auxiliary_loss_mlp": 0.01320125, + "balance_loss_clip": 1.16093647, + "balance_loss_mlp": 1.02524877, + "epoch": 0.3072205855828774, + "flos": 25887365347680.0, + "grad_norm": 2.6323178594277468, + "language_loss": 0.77676618, + "learning_rate": 3.2467185758215304e-06, + "loss": 0.80518216, + "num_input_tokens_seen": 54917320, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 2.94726562, + "step": 2555, + "time_per_iteration": 3.074007511138916 + }, + { + "auxiliary_loss_clip": 0.01526416, + "auxiliary_loss_mlp": 0.01330925, + "balance_loss_clip": 1.16738963, + "balance_loss_mlp": 1.03814745, + "epoch": 0.3073408284735165, + "flos": 22238564067360.0, + "grad_norm": 2.9423672045006137, + "language_loss": 0.85989642, + "learning_rate": 3.246109375653428e-06, + "loss": 0.88846982, + "num_input_tokens_seen": 54934085, + "router_z_loss_clip": 3.58984375, + "router_z_loss_mlp": 2.92773438, + "step": 2556, + "time_per_iteration": 2.9633824825286865 + }, + { + "auxiliary_loss_clip": 0.01525483, + "auxiliary_loss_mlp": 0.01320602, + "balance_loss_clip": 1.16557503, + "balance_loss_mlp": 1.02801514, + "epoch": 0.30746107136415557, + "flos": 19502323424640.0, + "grad_norm": 1.8915991436776112, + "language_loss": 0.78527331, + "learning_rate": 3.2454999864515243e-06, + "loss": 0.81373417, + "num_input_tokens_seen": 54953460, + "router_z_loss_clip": 3.6015625, + "router_z_loss_mlp": 2.92382812, + "step": 2557, + "time_per_iteration": 2.9850361347198486 + }, + { + "auxiliary_loss_clip": 0.01518949, + "auxiliary_loss_mlp": 0.01326305, + "balance_loss_clip": 1.16015899, + "balance_loss_mlp": 1.03295445, + "epoch": 0.3075813142547947, + "flos": 21726908088480.0, + "grad_norm": 2.0960630459935987, + "language_loss": 0.6970185, + "learning_rate": 3.244890408308263e-06, + "loss": 0.72547102, + "num_input_tokens_seen": 54974165, + "router_z_loss_clip": 3.58789062, + "router_z_loss_mlp": 2.93164062, + "step": 2558, + "time_per_iteration": 3.007352590560913 + }, + { + "auxiliary_loss_clip": 0.01524008, + "auxiliary_loss_mlp": 0.01322618, + "balance_loss_clip": 1.16237092, + "balance_loss_mlp": 1.02774203, + "epoch": 0.3077015571454338, + "flos": 24100286381280.0, + "grad_norm": 2.3242796375403736, + "language_loss": 0.61229062, + "learning_rate": 3.2442806413161165e-06, + "loss": 0.64075691, + "num_input_tokens_seen": 54993810, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 2.94726562, + "step": 2559, + "time_per_iteration": 3.030123710632324 + }, + { + "auxiliary_loss_clip": 0.0152277, + "auxiliary_loss_mlp": 0.01326714, + "balance_loss_clip": 1.16316223, + "balance_loss_mlp": 1.03241003, + "epoch": 0.30782180003607285, + "flos": 18407440300320.0, + "grad_norm": 2.1015527503829174, + "language_loss": 0.76170182, + "learning_rate": 3.243670685567586e-06, + "loss": 0.79019672, + "num_input_tokens_seen": 55011210, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 2.94335938, + "step": 2560, + "time_per_iteration": 3.0600807666778564 + }, + { + "auxiliary_loss_clip": 0.01520436, + "auxiliary_loss_mlp": 0.01312536, + "balance_loss_clip": 1.16093779, + "balance_loss_mlp": 1.02166522, + "epoch": 0.30794204292671196, + "flos": 23880831861600.0, + "grad_norm": 2.2798568147014393, + "language_loss": 0.80171913, + "learning_rate": 3.2430605411552012e-06, + "loss": 0.83004892, + "num_input_tokens_seen": 55031325, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 2.91015625, + "step": 2561, + "time_per_iteration": 3.055786371231079 + }, + { + "auxiliary_loss_clip": 0.01631999, + "auxiliary_loss_mlp": 0.01215126, + "balance_loss_clip": 1.27192724, + "balance_loss_mlp": 0.98395538, + "epoch": 0.30806228581735107, + "flos": 67935208353120.0, + "grad_norm": 1.130686086791606, + "language_loss": 0.70603567, + "learning_rate": 3.2424502081715205e-06, + "loss": 0.73450696, + "num_input_tokens_seen": 55094440, + "router_z_loss_clip": 3.59375, + "router_z_loss_mlp": 2.3046875, + "step": 2562, + "time_per_iteration": 3.5206356048583984 + }, + { + "auxiliary_loss_clip": 0.01520245, + "auxiliary_loss_mlp": 0.0132714, + "balance_loss_clip": 1.15998363, + "balance_loss_mlp": 1.03207278, + "epoch": 0.3081825287079901, + "flos": 23845558308480.0, + "grad_norm": 1.7609034445487601, + "language_loss": 0.77660847, + "learning_rate": 3.241839686709132e-06, + "loss": 0.80508226, + "num_input_tokens_seen": 55115375, + "router_z_loss_clip": 3.60351562, + "router_z_loss_mlp": 2.94921875, + "step": 2563, + "time_per_iteration": 3.1485166549682617 + }, + { + "auxiliary_loss_clip": 0.01522913, + "auxiliary_loss_mlp": 0.0133331, + "balance_loss_clip": 1.16242194, + "balance_loss_mlp": 1.03652608, + "epoch": 0.30830277159862923, + "flos": 16211453833440.0, + "grad_norm": 2.6577846582542914, + "language_loss": 0.82532692, + "learning_rate": 3.2412289768606495e-06, + "loss": 0.85388917, + "num_input_tokens_seen": 55131945, + "router_z_loss_clip": 3.60742188, + "router_z_loss_mlp": 2.96679688, + "step": 2564, + "time_per_iteration": 2.9681894779205322 + }, + { + "auxiliary_loss_clip": 0.0152007, + "auxiliary_loss_mlp": 0.01339045, + "balance_loss_clip": 1.15931129, + "balance_loss_mlp": 1.04455078, + "epoch": 0.30842301448926834, + "flos": 29351985661440.0, + "grad_norm": 2.222114281757948, + "language_loss": 0.82889515, + "learning_rate": 3.240618078718718e-06, + "loss": 0.85748631, + "num_input_tokens_seen": 55153405, + "router_z_loss_clip": 3.609375, + "router_z_loss_mlp": 2.94140625, + "step": 2565, + "time_per_iteration": 3.0800113677978516 + }, + { + "auxiliary_loss_clip": 0.01518406, + "auxiliary_loss_mlp": 0.01335594, + "balance_loss_clip": 1.15864801, + "balance_loss_mlp": 1.03861976, + "epoch": 0.3085432573799074, + "flos": 21947386668480.0, + "grad_norm": 1.8652470315311538, + "language_loss": 0.74495411, + "learning_rate": 3.240006992376011e-06, + "loss": 0.77349412, + "num_input_tokens_seen": 55173030, + "router_z_loss_clip": 3.6015625, + "router_z_loss_mlp": 2.96484375, + "step": 2566, + "time_per_iteration": 3.093096971511841 + }, + { + "auxiliary_loss_clip": 0.01523392, + "auxiliary_loss_mlp": 0.01334022, + "balance_loss_clip": 1.16330242, + "balance_loss_mlp": 1.0393368, + "epoch": 0.3086635002705465, + "flos": 22056924287520.0, + "grad_norm": 4.422129317241721, + "language_loss": 0.7639904, + "learning_rate": 3.2393957179252284e-06, + "loss": 0.79256457, + "num_input_tokens_seen": 55189565, + "router_z_loss_clip": 3.6015625, + "router_z_loss_mlp": 2.94335938, + "step": 2567, + "time_per_iteration": 3.036785840988159 + }, + { + "auxiliary_loss_clip": 0.01521867, + "auxiliary_loss_mlp": 0.0132672, + "balance_loss_clip": 1.16195393, + "balance_loss_mlp": 1.03127217, + "epoch": 0.3087837431611856, + "flos": 32668381268640.0, + "grad_norm": 1.9375601726793865, + "language_loss": 0.80780602, + "learning_rate": 3.2387842554591016e-06, + "loss": 0.83629185, + "num_input_tokens_seen": 55210380, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 2.95117188, + "step": 2568, + "time_per_iteration": 3.0991339683532715 + }, + { + "auxiliary_loss_clip": 0.01525669, + "auxiliary_loss_mlp": 0.01326705, + "balance_loss_clip": 1.16521847, + "balance_loss_mlp": 1.03163838, + "epoch": 0.3089039860518247, + "flos": 17600965819200.0, + "grad_norm": 2.3413917425718442, + "language_loss": 0.87925994, + "learning_rate": 3.238172605070388e-06, + "loss": 0.90778369, + "num_input_tokens_seen": 55225795, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 2.94921875, + "step": 2569, + "time_per_iteration": 4.765387535095215 + }, + { + "auxiliary_loss_clip": 0.01521282, + "auxiliary_loss_mlp": 0.01332101, + "balance_loss_clip": 1.16204894, + "balance_loss_mlp": 1.03321993, + "epoch": 0.3090242289424638, + "flos": 14385118857120.0, + "grad_norm": 3.253253296353412, + "language_loss": 0.78855598, + "learning_rate": 3.2375607668518745e-06, + "loss": 0.8170898, + "num_input_tokens_seen": 55238830, + "router_z_loss_clip": 3.59375, + "router_z_loss_mlp": 2.984375, + "step": 2570, + "time_per_iteration": 2.9636728763580322 + }, + { + "auxiliary_loss_clip": 0.01528694, + "auxiliary_loss_mlp": 0.01329399, + "balance_loss_clip": 1.16911983, + "balance_loss_mlp": 1.03471422, + "epoch": 0.30914447183310284, + "flos": 16070359620960.0, + "grad_norm": 2.326271219562318, + "language_loss": 0.90115762, + "learning_rate": 3.236948740896377e-06, + "loss": 0.92973864, + "num_input_tokens_seen": 55253630, + "router_z_loss_clip": 3.59375, + "router_z_loss_mlp": 2.94335938, + "step": 2571, + "time_per_iteration": 2.998182535171509 + }, + { + "auxiliary_loss_clip": 0.01524343, + "auxiliary_loss_mlp": 0.01349228, + "balance_loss_clip": 1.16536248, + "balance_loss_mlp": 1.04958403, + "epoch": 0.30926471472374195, + "flos": 32232771979200.0, + "grad_norm": 1.652538655721959, + "language_loss": 0.84273648, + "learning_rate": 3.2363365272967384e-06, + "loss": 0.87147224, + "num_input_tokens_seen": 55276200, + "router_z_loss_clip": 3.59179688, + "router_z_loss_mlp": 2.99023438, + "step": 2572, + "time_per_iteration": 3.386817216873169 + }, + { + "auxiliary_loss_clip": 0.01523341, + "auxiliary_loss_mlp": 0.01337624, + "balance_loss_clip": 1.16255701, + "balance_loss_mlp": 1.04293871, + "epoch": 0.30938495761438106, + "flos": 20372517943200.0, + "grad_norm": 1.9323736450884714, + "language_loss": 0.81533527, + "learning_rate": 3.235724126145832e-06, + "loss": 0.84394491, + "num_input_tokens_seen": 55292235, + "router_z_loss_clip": 3.6015625, + "router_z_loss_mlp": 2.9453125, + "step": 2573, + "time_per_iteration": 2.9515745639801025 + }, + { + "auxiliary_loss_clip": 0.01523301, + "auxiliary_loss_mlp": 0.01338165, + "balance_loss_clip": 1.16377664, + "balance_loss_mlp": 1.04462433, + "epoch": 0.3095052005050201, + "flos": 24063457773600.0, + "grad_norm": 1.614626185632394, + "language_loss": 0.77750707, + "learning_rate": 3.235111537536558e-06, + "loss": 0.80612171, + "num_input_tokens_seen": 55313050, + "router_z_loss_clip": 3.59179688, + "router_z_loss_mlp": 2.93359375, + "step": 2574, + "time_per_iteration": 3.8473923206329346 + }, + { + "auxiliary_loss_clip": 0.01520165, + "auxiliary_loss_mlp": 0.01321756, + "balance_loss_clip": 1.16022372, + "balance_loss_mlp": 1.02840614, + "epoch": 0.30962544339565923, + "flos": 23403652944480.0, + "grad_norm": 1.7610896681704193, + "language_loss": 0.82896298, + "learning_rate": 3.2344987615618456e-06, + "loss": 0.85738224, + "num_input_tokens_seen": 55332885, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 2.93359375, + "step": 2575, + "time_per_iteration": 3.8204972743988037 + }, + { + "auxiliary_loss_clip": 0.01524353, + "auxiliary_loss_mlp": 0.0132569, + "balance_loss_clip": 1.16457105, + "balance_loss_mlp": 1.03214908, + "epoch": 0.30974568628629834, + "flos": 33802672115520.0, + "grad_norm": 1.713663644326917, + "language_loss": 0.78490973, + "learning_rate": 3.2338857983146533e-06, + "loss": 0.81341016, + "num_input_tokens_seen": 55354385, + "router_z_loss_clip": 3.59375, + "router_z_loss_mlp": 2.93359375, + "step": 2576, + "time_per_iteration": 3.0517022609710693 + }, + { + "auxiliary_loss_clip": 0.01533449, + "auxiliary_loss_mlp": 0.01337875, + "balance_loss_clip": 1.17494905, + "balance_loss_mlp": 1.04090047, + "epoch": 0.3098659291769374, + "flos": 20231651299680.0, + "grad_norm": 1.8444835869584748, + "language_loss": 0.76299453, + "learning_rate": 3.233272647887966e-06, + "loss": 0.79170775, + "num_input_tokens_seen": 55373275, + "router_z_loss_clip": 3.5859375, + "router_z_loss_mlp": 2.96679688, + "step": 2577, + "time_per_iteration": 3.0334367752075195 + }, + { + "auxiliary_loss_clip": 0.01531428, + "auxiliary_loss_mlp": 0.01336037, + "balance_loss_clip": 1.17303371, + "balance_loss_mlp": 1.03982568, + "epoch": 0.3099861720675765, + "flos": 24750571242240.0, + "grad_norm": 1.6347727415376434, + "language_loss": 0.90111494, + "learning_rate": 3.2326593103747985e-06, + "loss": 0.9297896, + "num_input_tokens_seen": 55392290, + "router_z_loss_clip": 3.5859375, + "router_z_loss_mlp": 2.9609375, + "step": 2578, + "time_per_iteration": 3.0385355949401855 + }, + { + "auxiliary_loss_clip": 0.01531758, + "auxiliary_loss_mlp": 0.01328461, + "balance_loss_clip": 1.17320323, + "balance_loss_mlp": 1.03587377, + "epoch": 0.3101064149582156, + "flos": 11767101382080.0, + "grad_norm": 1.9650042130817507, + "language_loss": 0.85095525, + "learning_rate": 3.2320457858681936e-06, + "loss": 0.87955743, + "num_input_tokens_seen": 55410680, + "router_z_loss_clip": 3.58398438, + "router_z_loss_mlp": 2.92578125, + "step": 2579, + "time_per_iteration": 3.020940065383911 + }, + { + "auxiliary_loss_clip": 0.01530582, + "auxiliary_loss_mlp": 0.01323293, + "balance_loss_clip": 1.171381, + "balance_loss_mlp": 1.0331856, + "epoch": 0.31022665784885467, + "flos": 23035139298720.0, + "grad_norm": 4.409513592456886, + "language_loss": 0.85878932, + "learning_rate": 3.2314320744612228e-06, + "loss": 0.88732815, + "num_input_tokens_seen": 55425980, + "router_z_loss_clip": 3.58984375, + "router_z_loss_mlp": 2.90429688, + "step": 2580, + "time_per_iteration": 3.016028642654419 + }, + { + "auxiliary_loss_clip": 0.01526966, + "auxiliary_loss_mlp": 0.01319023, + "balance_loss_clip": 1.16890335, + "balance_loss_mlp": 1.02719843, + "epoch": 0.3103469007394938, + "flos": 16291027841760.0, + "grad_norm": 1.9728762115555838, + "language_loss": 0.76775849, + "learning_rate": 3.2308181762469854e-06, + "loss": 0.7962184, + "num_input_tokens_seen": 55443925, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 2.91796875, + "step": 2581, + "time_per_iteration": 3.0177955627441406 + }, + { + "auxiliary_loss_clip": 0.01526395, + "auxiliary_loss_mlp": 0.0132896, + "balance_loss_clip": 1.16857314, + "balance_loss_mlp": 1.03332067, + "epoch": 0.3104671436301329, + "flos": 30517719317280.0, + "grad_norm": 2.1972583649291706, + "language_loss": 0.7844575, + "learning_rate": 3.230204091318609e-06, + "loss": 0.81301105, + "num_input_tokens_seen": 55464465, + "router_z_loss_clip": 3.58007812, + "router_z_loss_mlp": 2.95703125, + "step": 2582, + "time_per_iteration": 3.069847583770752 + }, + { + "auxiliary_loss_clip": 0.01523763, + "auxiliary_loss_mlp": 0.01322318, + "balance_loss_clip": 1.16534221, + "balance_loss_mlp": 1.03144705, + "epoch": 0.31058738652077195, + "flos": 20049290884800.0, + "grad_norm": 4.505509142440538, + "language_loss": 0.84826696, + "learning_rate": 3.2295898197692503e-06, + "loss": 0.87672782, + "num_input_tokens_seen": 55483425, + "router_z_loss_clip": 3.58398438, + "router_z_loss_mlp": 2.91015625, + "step": 2583, + "time_per_iteration": 3.0355703830718994 + }, + { + "auxiliary_loss_clip": 0.01531236, + "auxiliary_loss_mlp": 0.01337224, + "balance_loss_clip": 1.17299199, + "balance_loss_mlp": 1.04501843, + "epoch": 0.31070762941141106, + "flos": 28077169524480.0, + "grad_norm": 1.8732853603783808, + "language_loss": 0.79549372, + "learning_rate": 3.228975361692094e-06, + "loss": 0.82417828, + "num_input_tokens_seen": 55504445, + "router_z_loss_clip": 3.58398438, + "router_z_loss_mlp": 2.92382812, + "step": 2584, + "time_per_iteration": 2.9969871044158936 + }, + { + "auxiliary_loss_clip": 0.01525757, + "auxiliary_loss_mlp": 0.01333256, + "balance_loss_clip": 1.1691674, + "balance_loss_mlp": 1.03837967, + "epoch": 0.31082787230205017, + "flos": 20524004471520.0, + "grad_norm": 2.2487842055615603, + "language_loss": 0.79890531, + "learning_rate": 3.228360717180352e-06, + "loss": 0.82749546, + "num_input_tokens_seen": 55521970, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.9453125, + "step": 2585, + "time_per_iteration": 3.041482448577881 + }, + { + "auxiliary_loss_clip": 0.01600295, + "auxiliary_loss_mlp": 0.01386478, + "balance_loss_clip": 1.24133253, + "balance_loss_mlp": 1.14462662, + "epoch": 0.3109481151926892, + "flos": 62452524392640.0, + "grad_norm": 0.8715302561621001, + "language_loss": 0.59350693, + "learning_rate": 3.227745886327266e-06, + "loss": 0.6233747, + "num_input_tokens_seen": 55580665, + "router_z_loss_clip": 3.59375, + "router_z_loss_mlp": 2.4140625, + "step": 2586, + "time_per_iteration": 3.3930165767669678 + }, + { + "auxiliary_loss_clip": 0.01597802, + "auxiliary_loss_mlp": 0.01333572, + "balance_loss_clip": 1.23946047, + "balance_loss_mlp": 1.09324646, + "epoch": 0.31106835808332833, + "flos": 44752147773120.0, + "grad_norm": 0.8617350363960538, + "language_loss": 0.55794394, + "learning_rate": 3.227130869226105e-06, + "loss": 0.58725762, + "num_input_tokens_seen": 55637825, + "router_z_loss_clip": 3.59375, + "router_z_loss_mlp": 2.3984375, + "step": 2587, + "time_per_iteration": 3.3643126487731934 + }, + { + "auxiliary_loss_clip": 0.01532543, + "auxiliary_loss_mlp": 0.01315195, + "balance_loss_clip": 1.1746012, + "balance_loss_mlp": 1.02623224, + "epoch": 0.3111886009739674, + "flos": 23405245927200.0, + "grad_norm": 2.491431734364211, + "language_loss": 0.82931167, + "learning_rate": 3.226515665970167e-06, + "loss": 0.8577891, + "num_input_tokens_seen": 55655365, + "router_z_loss_clip": 3.58007812, + "router_z_loss_mlp": 2.89648438, + "step": 2588, + "time_per_iteration": 3.1146137714385986 + }, + { + "auxiliary_loss_clip": 0.01526937, + "auxiliary_loss_mlp": 0.01313195, + "balance_loss_clip": 1.16855145, + "balance_loss_mlp": 1.02156138, + "epoch": 0.3113088438646065, + "flos": 17532694402560.0, + "grad_norm": 2.152324196860715, + "language_loss": 0.86533248, + "learning_rate": 3.225900276652777e-06, + "loss": 0.89373386, + "num_input_tokens_seen": 55672140, + "router_z_loss_clip": 3.58789062, + "router_z_loss_mlp": 2.91796875, + "step": 2589, + "time_per_iteration": 2.9303507804870605 + }, + { + "auxiliary_loss_clip": 0.01528285, + "auxiliary_loss_mlp": 0.0133105, + "balance_loss_clip": 1.17095947, + "balance_loss_mlp": 1.04342198, + "epoch": 0.3114290867552456, + "flos": 28368308995200.0, + "grad_norm": 1.581784098761534, + "language_loss": 0.75287741, + "learning_rate": 3.2252847013672906e-06, + "loss": 0.78147078, + "num_input_tokens_seen": 55694800, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.8828125, + "step": 2590, + "time_per_iteration": 3.0891354084014893 + }, + { + "auxiliary_loss_clip": 0.01530462, + "auxiliary_loss_mlp": 0.01323875, + "balance_loss_clip": 1.17276502, + "balance_loss_mlp": 1.03891683, + "epoch": 0.31154932964588467, + "flos": 27381522219840.0, + "grad_norm": 3.7156678738938527, + "language_loss": 0.75902963, + "learning_rate": 3.224668940207089e-06, + "loss": 0.78757298, + "num_input_tokens_seen": 55713785, + "router_z_loss_clip": 3.57617188, + "router_z_loss_mlp": 2.85546875, + "step": 2591, + "time_per_iteration": 2.998724937438965 + }, + { + "auxiliary_loss_clip": 0.01526789, + "auxiliary_loss_mlp": 0.01332978, + "balance_loss_clip": 1.16884112, + "balance_loss_mlp": 1.0432514, + "epoch": 0.3116695725365238, + "flos": 26544287636640.0, + "grad_norm": 2.060526742150488, + "language_loss": 0.87415779, + "learning_rate": 3.2240529932655828e-06, + "loss": 0.90275538, + "num_input_tokens_seen": 55733050, + "router_z_loss_clip": 3.58007812, + "router_z_loss_mlp": 2.90234375, + "step": 2592, + "time_per_iteration": 2.9671788215637207 + }, + { + "auxiliary_loss_clip": 0.01530403, + "auxiliary_loss_mlp": 0.01352002, + "balance_loss_clip": 1.1729821, + "balance_loss_mlp": 1.06227577, + "epoch": 0.3117898154271629, + "flos": 21179675131200.0, + "grad_norm": 3.5105462219277155, + "language_loss": 0.8861981, + "learning_rate": 3.223436860636211e-06, + "loss": 0.91502213, + "num_input_tokens_seen": 55748685, + "router_z_loss_clip": 3.57617188, + "router_z_loss_mlp": 2.8984375, + "step": 2593, + "time_per_iteration": 2.9258835315704346 + }, + { + "auxiliary_loss_clip": 0.01523348, + "auxiliary_loss_mlp": 0.01327639, + "balance_loss_clip": 1.16542363, + "balance_loss_mlp": 1.03734016, + "epoch": 0.31191005831780194, + "flos": 27274942997280.0, + "grad_norm": 1.7782506063551746, + "language_loss": 0.74182683, + "learning_rate": 3.2228205424124403e-06, + "loss": 0.77033663, + "num_input_tokens_seen": 55771840, + "router_z_loss_clip": 3.58007812, + "router_z_loss_mlp": 2.90429688, + "step": 2594, + "time_per_iteration": 2.9638209342956543 + }, + { + "auxiliary_loss_clip": 0.01520285, + "auxiliary_loss_mlp": 0.01328266, + "balance_loss_clip": 1.16329658, + "balance_loss_mlp": 1.0394932, + "epoch": 0.31203030120844105, + "flos": 12964846769280.0, + "grad_norm": 2.817212623555358, + "language_loss": 0.74366426, + "learning_rate": 3.222204038687765e-06, + "loss": 0.77214974, + "num_input_tokens_seen": 55784975, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.890625, + "step": 2595, + "time_per_iteration": 2.9400904178619385 + }, + { + "auxiliary_loss_clip": 0.01521273, + "auxiliary_loss_mlp": 0.01335616, + "balance_loss_clip": 1.16332519, + "balance_loss_mlp": 1.04760623, + "epoch": 0.31215054409908016, + "flos": 27564413628960.0, + "grad_norm": 2.485454447611846, + "language_loss": 0.88230383, + "learning_rate": 3.221587349555709e-06, + "loss": 0.9108727, + "num_input_tokens_seen": 55805235, + "router_z_loss_clip": 3.58007812, + "router_z_loss_mlp": 2.88476562, + "step": 2596, + "time_per_iteration": 3.8102571964263916 + }, + { + "auxiliary_loss_clip": 0.01520836, + "auxiliary_loss_mlp": 0.01336572, + "balance_loss_clip": 1.16438103, + "balance_loss_mlp": 1.04799044, + "epoch": 0.3122707869897192, + "flos": 21508439700960.0, + "grad_norm": 1.873700666312009, + "language_loss": 0.69749284, + "learning_rate": 3.2209704751098236e-06, + "loss": 0.72606689, + "num_input_tokens_seen": 55824265, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.890625, + "step": 2597, + "time_per_iteration": 3.852259397506714 + }, + { + "auxiliary_loss_clip": 0.01522839, + "auxiliary_loss_mlp": 0.0133502, + "balance_loss_clip": 1.164464, + "balance_loss_mlp": 1.04319537, + "epoch": 0.31239102988035833, + "flos": 15188369444640.0, + "grad_norm": 2.2197760292667112, + "language_loss": 0.82550794, + "learning_rate": 3.2203534154436875e-06, + "loss": 0.85408652, + "num_input_tokens_seen": 55838620, + "router_z_loss_clip": 3.58789062, + "router_z_loss_mlp": 2.91992188, + "step": 2598, + "time_per_iteration": 2.988595724105835 + }, + { + "auxiliary_loss_clip": 0.01523853, + "auxiliary_loss_mlp": 0.01335077, + "balance_loss_clip": 1.16660142, + "balance_loss_mlp": 1.04573178, + "epoch": 0.31251127277099744, + "flos": 22055862299040.0, + "grad_norm": 1.8596172013798837, + "language_loss": 0.75659508, + "learning_rate": 3.2197361706509084e-06, + "loss": 0.78518438, + "num_input_tokens_seen": 55859375, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.89648438, + "step": 2599, + "time_per_iteration": 3.1245408058166504 + }, + { + "auxiliary_loss_clip": 0.0152437, + "auxiliary_loss_mlp": 0.01334035, + "balance_loss_clip": 1.1661768, + "balance_loss_mlp": 1.04068446, + "epoch": 0.3126315156616365, + "flos": 15195727507680.0, + "grad_norm": 2.8766291397812718, + "language_loss": 0.83973604, + "learning_rate": 3.2191187408251228e-06, + "loss": 0.86832011, + "num_input_tokens_seen": 55876535, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 2.93554688, + "step": 2600, + "time_per_iteration": 3.0123062133789062 + }, + { + "auxiliary_loss_clip": 0.01521095, + "auxiliary_loss_mlp": 0.01342834, + "balance_loss_clip": 1.1631664, + "balance_loss_mlp": 1.0464319, + "epoch": 0.3127517585522756, + "flos": 18147023003520.0, + "grad_norm": 2.469182081691859, + "language_loss": 0.78637338, + "learning_rate": 3.218501126059993e-06, + "loss": 0.81501269, + "num_input_tokens_seen": 55891930, + "router_z_loss_clip": 3.58398438, + "router_z_loss_mlp": 2.9609375, + "step": 2601, + "time_per_iteration": 3.9433629512786865 + }, + { + "auxiliary_loss_clip": 0.01520517, + "auxiliary_loss_mlp": 0.01325442, + "balance_loss_clip": 1.16353321, + "balance_loss_mlp": 1.03514409, + "epoch": 0.31287200144291466, + "flos": 21910823557920.0, + "grad_norm": 1.9022704958713876, + "language_loss": 0.81769729, + "learning_rate": 3.2178833264492116e-06, + "loss": 0.84615684, + "num_input_tokens_seen": 55910635, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.90625, + "step": 2602, + "time_per_iteration": 3.858285903930664 + }, + { + "auxiliary_loss_clip": 0.01529148, + "auxiliary_loss_mlp": 0.01329394, + "balance_loss_clip": 1.17182732, + "balance_loss_mlp": 1.03661585, + "epoch": 0.31299224433355377, + "flos": 29899522044000.0, + "grad_norm": 1.9449728061969154, + "language_loss": 0.76249361, + "learning_rate": 3.217265342086498e-06, + "loss": 0.7910791, + "num_input_tokens_seen": 55931125, + "router_z_loss_clip": 3.57617188, + "router_z_loss_mlp": 2.92578125, + "step": 2603, + "time_per_iteration": 3.030470609664917 + }, + { + "auxiliary_loss_clip": 0.01525408, + "auxiliary_loss_mlp": 0.01329208, + "balance_loss_clip": 1.16909778, + "balance_loss_mlp": 1.0343318, + "epoch": 0.3131124872241929, + "flos": 11657487906720.0, + "grad_norm": 3.5853180671749842, + "language_loss": 0.73087043, + "learning_rate": 3.216647173065599e-06, + "loss": 0.75941664, + "num_input_tokens_seen": 55946590, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.94726562, + "step": 2604, + "time_per_iteration": 2.9356842041015625 + }, + { + "auxiliary_loss_clip": 0.01525868, + "auxiliary_loss_mlp": 0.01331449, + "balance_loss_clip": 1.16693354, + "balance_loss_mlp": 1.03752708, + "epoch": 0.31323273011483194, + "flos": 49852095392160.0, + "grad_norm": 1.8307031888441472, + "language_loss": 0.7369374, + "learning_rate": 3.216028819480292e-06, + "loss": 0.76551056, + "num_input_tokens_seen": 55967930, + "router_z_loss_clip": 3.58984375, + "router_z_loss_mlp": 2.93945312, + "step": 2605, + "time_per_iteration": 3.2681026458740234 + }, + { + "auxiliary_loss_clip": 0.0152997, + "auxiliary_loss_mlp": 0.01331202, + "balance_loss_clip": 1.17305946, + "balance_loss_mlp": 1.03918707, + "epoch": 0.31335297300547105, + "flos": 22603474537920.0, + "grad_norm": 2.382749118933418, + "language_loss": 0.75430357, + "learning_rate": 3.2154102814243793e-06, + "loss": 0.78291523, + "num_input_tokens_seen": 55987070, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.91992188, + "step": 2606, + "time_per_iteration": 2.927800178527832 + }, + { + "auxiliary_loss_clip": 0.01528708, + "auxiliary_loss_mlp": 0.01339679, + "balance_loss_clip": 1.1717782, + "balance_loss_mlp": 1.04499328, + "epoch": 0.31347321589611016, + "flos": 34713525985920.0, + "grad_norm": 2.280136837583912, + "language_loss": 0.67001927, + "learning_rate": 3.2147915589916937e-06, + "loss": 0.69870323, + "num_input_tokens_seen": 56008630, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.9453125, + "step": 2607, + "time_per_iteration": 3.0537168979644775 + }, + { + "auxiliary_loss_clip": 0.0152968, + "auxiliary_loss_mlp": 0.01323587, + "balance_loss_clip": 1.17329049, + "balance_loss_mlp": 1.03023648, + "epoch": 0.3135934587867492, + "flos": 19757847988800.0, + "grad_norm": 1.9242387748604748, + "language_loss": 0.82977128, + "learning_rate": 3.2141726522760938e-06, + "loss": 0.8583039, + "num_input_tokens_seen": 56026690, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.93164062, + "step": 2608, + "time_per_iteration": 2.953141212463379 + }, + { + "auxiliary_loss_clip": 0.0155531, + "auxiliary_loss_mlp": 0.01522057, + "balance_loss_clip": 1.2014122, + "balance_loss_mlp": 1.29851532, + "epoch": 0.3137137016773883, + "flos": 65823043848480.0, + "grad_norm": 1.0834612985286531, + "language_loss": 0.52621293, + "learning_rate": 3.213553561371469e-06, + "loss": 0.55698663, + "num_input_tokens_seen": 56090425, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 2.2421875, + "step": 2609, + "time_per_iteration": 3.5300028324127197 + }, + { + "auxiliary_loss_clip": 0.01529013, + "auxiliary_loss_mlp": 0.01352064, + "balance_loss_clip": 1.17155457, + "balance_loss_mlp": 1.05814219, + "epoch": 0.31383394456802743, + "flos": 16254578515680.0, + "grad_norm": 2.663363240417456, + "language_loss": 0.96184391, + "learning_rate": 3.212934286371733e-06, + "loss": 0.99065471, + "num_input_tokens_seen": 56107135, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.93945312, + "step": 2610, + "time_per_iteration": 3.0719823837280273 + }, + { + "auxiliary_loss_clip": 0.01535278, + "auxiliary_loss_mlp": 0.01374757, + "balance_loss_clip": 1.17941785, + "balance_loss_mlp": 1.07396841, + "epoch": 0.3139541874586665, + "flos": 38798315837280.0, + "grad_norm": 2.4153578807006726, + "language_loss": 0.83314162, + "learning_rate": 3.2123148273708304e-06, + "loss": 0.86224198, + "num_input_tokens_seen": 56127325, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 3.00195312, + "step": 2611, + "time_per_iteration": 3.1427652835845947 + }, + { + "auxiliary_loss_clip": 0.0153185, + "auxiliary_loss_mlp": 0.01400726, + "balance_loss_clip": 1.17492294, + "balance_loss_mlp": 1.10260701, + "epoch": 0.3140744303493056, + "flos": 25048689494400.0, + "grad_norm": 1.968341746523796, + "language_loss": 0.76958764, + "learning_rate": 3.211695184462733e-06, + "loss": 0.79891336, + "num_input_tokens_seen": 56148500, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 2.97460938, + "step": 2612, + "time_per_iteration": 2.9325850009918213 + }, + { + "auxiliary_loss_clip": 0.01548515, + "auxiliary_loss_mlp": 0.01230064, + "balance_loss_clip": 1.19380486, + "balance_loss_mlp": 0.99507904, + "epoch": 0.3141946732399447, + "flos": 72510755767200.0, + "grad_norm": 0.8926498481310993, + "language_loss": 0.60480952, + "learning_rate": 3.2110753577414383e-06, + "loss": 0.6325953, + "num_input_tokens_seen": 56210080, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 2.34375, + "step": 2613, + "time_per_iteration": 3.399953603744507 + }, + { + "auxiliary_loss_clip": 0.01524313, + "auxiliary_loss_mlp": 0.0143014, + "balance_loss_clip": 1.16792119, + "balance_loss_mlp": 1.12610888, + "epoch": 0.31431491613058377, + "flos": 19241526846240.0, + "grad_norm": 1.9548361650773338, + "language_loss": 0.78865248, + "learning_rate": 3.2104553473009757e-06, + "loss": 0.81819701, + "num_input_tokens_seen": 56228200, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 3.03710938, + "step": 2614, + "time_per_iteration": 2.9087064266204834 + }, + { + "auxiliary_loss_clip": 0.01525367, + "auxiliary_loss_mlp": 0.01439238, + "balance_loss_clip": 1.16957986, + "balance_loss_mlp": 1.1365416, + "epoch": 0.3144351590212229, + "flos": 36213561722880.0, + "grad_norm": 2.062892146187529, + "language_loss": 0.67976844, + "learning_rate": 3.209835153235399e-06, + "loss": 0.70941448, + "num_input_tokens_seen": 56249755, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 3.02148438, + "step": 2615, + "time_per_iteration": 3.106436014175415 + }, + { + "auxiliary_loss_clip": 0.0153019, + "auxiliary_loss_mlp": 0.01425439, + "balance_loss_clip": 1.1734879, + "balance_loss_mlp": 1.12293315, + "epoch": 0.314555401911862, + "flos": 18553730670720.0, + "grad_norm": 2.00135077174969, + "language_loss": 0.67824334, + "learning_rate": 3.2092147756387916e-06, + "loss": 0.70779967, + "num_input_tokens_seen": 56270080, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 3.01953125, + "step": 2616, + "time_per_iteration": 3.01802134513855 + }, + { + "auxiliary_loss_clip": 0.01534925, + "auxiliary_loss_mlp": 0.01409402, + "balance_loss_clip": 1.17938125, + "balance_loss_mlp": 1.10575223, + "epoch": 0.31467564480250104, + "flos": 16364760913440.0, + "grad_norm": 1.7954688857743448, + "language_loss": 0.83708084, + "learning_rate": 3.208594214605264e-06, + "loss": 0.8665241, + "num_input_tokens_seen": 56288625, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 3.03125, + "step": 2617, + "time_per_iteration": 3.0166573524475098 + }, + { + "auxiliary_loss_clip": 0.01522559, + "auxiliary_loss_mlp": 0.01402963, + "balance_loss_clip": 1.16677809, + "balance_loss_mlp": 1.10255527, + "epoch": 0.31479588769314015, + "flos": 21654502502400.0, + "grad_norm": 2.2139540805217552, + "language_loss": 0.77780038, + "learning_rate": 3.2079734702289553e-06, + "loss": 0.80705559, + "num_input_tokens_seen": 56307520, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.99804688, + "step": 2618, + "time_per_iteration": 3.09822154045105 + }, + { + "auxiliary_loss_clip": 0.01533687, + "auxiliary_loss_mlp": 0.01366768, + "balance_loss_clip": 1.18014908, + "balance_loss_mlp": 1.1195755, + "epoch": 0.3149161305837792, + "flos": 66057480355680.0, + "grad_norm": 1.1212078692752454, + "language_loss": 0.60396165, + "learning_rate": 3.207352542604031e-06, + "loss": 0.63296622, + "num_input_tokens_seen": 56369855, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 2.46875, + "step": 2619, + "time_per_iteration": 3.5880630016326904 + }, + { + "auxiliary_loss_clip": 0.01526778, + "auxiliary_loss_mlp": 0.01364377, + "balance_loss_clip": 1.17162132, + "balance_loss_mlp": 1.06873786, + "epoch": 0.3150363734744183, + "flos": 28989806018400.0, + "grad_norm": 1.5385814503760717, + "language_loss": 0.78581816, + "learning_rate": 3.2067314318246864e-06, + "loss": 0.81472969, + "num_input_tokens_seen": 56390570, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 2.95117188, + "step": 2620, + "time_per_iteration": 3.0261924266815186 + }, + { + "auxiliary_loss_clip": 0.01528942, + "auxiliary_loss_mlp": 0.01350366, + "balance_loss_clip": 1.17393112, + "balance_loss_mlp": 1.0533917, + "epoch": 0.31515661636505743, + "flos": 27639094904640.0, + "grad_norm": 1.8419302862655784, + "language_loss": 0.77974629, + "learning_rate": 3.206110137985143e-06, + "loss": 0.80853939, + "num_input_tokens_seen": 56410775, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.96679688, + "step": 2621, + "time_per_iteration": 3.1760196685791016 + }, + { + "auxiliary_loss_clip": 0.01519376, + "auxiliary_loss_mlp": 0.01338839, + "balance_loss_clip": 1.16369987, + "balance_loss_mlp": 1.04052985, + "epoch": 0.3152768592556965, + "flos": 24607808190720.0, + "grad_norm": 2.236872790911308, + "language_loss": 0.92303395, + "learning_rate": 3.2054886611796505e-06, + "loss": 0.95161605, + "num_input_tokens_seen": 56429770, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.9765625, + "step": 2622, + "time_per_iteration": 2.977529764175415 + }, + { + "auxiliary_loss_clip": 0.01523942, + "auxiliary_loss_mlp": 0.0126461, + "balance_loss_clip": 1.17138648, + "balance_loss_mlp": 1.02657318, + "epoch": 0.3153971021463356, + "flos": 68482403746560.0, + "grad_norm": 0.9014090323649268, + "language_loss": 0.63518929, + "learning_rate": 3.204867001502487e-06, + "loss": 0.66307485, + "num_input_tokens_seen": 56488425, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 2.375, + "step": 2623, + "time_per_iteration": 3.4094464778900146 + }, + { + "auxiliary_loss_clip": 0.01524522, + "auxiliary_loss_mlp": 0.01329292, + "balance_loss_clip": 1.16939306, + "balance_loss_mlp": 1.03594208, + "epoch": 0.3155173450369747, + "flos": 25596112092480.0, + "grad_norm": 1.9160623659861988, + "language_loss": 0.80988836, + "learning_rate": 3.2042451590479567e-06, + "loss": 0.83842647, + "num_input_tokens_seen": 56508940, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.93164062, + "step": 2624, + "time_per_iteration": 3.8218328952789307 + }, + { + "auxiliary_loss_clip": 0.01521113, + "auxiliary_loss_mlp": 0.013267, + "balance_loss_clip": 1.16599131, + "balance_loss_mlp": 1.03544772, + "epoch": 0.31563758792761376, + "flos": 24311244993120.0, + "grad_norm": 2.3479382986986894, + "language_loss": 0.87039256, + "learning_rate": 3.203623133910394e-06, + "loss": 0.89887071, + "num_input_tokens_seen": 56527245, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 2.91210938, + "step": 2625, + "time_per_iteration": 3.860792636871338 + }, + { + "auxiliary_loss_clip": 0.01522222, + "auxiliary_loss_mlp": 0.01344293, + "balance_loss_clip": 1.16662145, + "balance_loss_mlp": 1.05189633, + "epoch": 0.31575783081825287, + "flos": 31906510668000.0, + "grad_norm": 11.624990073233255, + "language_loss": 0.77766633, + "learning_rate": 3.203000926184158e-06, + "loss": 0.8063314, + "num_input_tokens_seen": 56546170, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.92578125, + "step": 2626, + "time_per_iteration": 3.0035719871520996 + }, + { + "auxiliary_loss_clip": 0.01519543, + "auxiliary_loss_mlp": 0.0132757, + "balance_loss_clip": 1.1652832, + "balance_loss_mlp": 1.03956079, + "epoch": 0.315878073708892, + "flos": 30813637736160.0, + "grad_norm": 2.3130465566262965, + "language_loss": 0.77855134, + "learning_rate": 3.202378535963639e-06, + "loss": 0.80702245, + "num_input_tokens_seen": 56567085, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 2.8828125, + "step": 2627, + "time_per_iteration": 3.0145254135131836 + }, + { + "auxiliary_loss_clip": 0.01522406, + "auxiliary_loss_mlp": 0.01350843, + "balance_loss_clip": 1.16851842, + "balance_loss_mlp": 1.05920935, + "epoch": 0.31599831659953104, + "flos": 22202456094720.0, + "grad_norm": 1.58672469786461, + "language_loss": 0.8404789, + "learning_rate": 3.2017559633432516e-06, + "loss": 0.86921138, + "num_input_tokens_seen": 56586715, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 2.91796875, + "step": 2628, + "time_per_iteration": 3.7982912063598633 + }, + { + "auxiliary_loss_clip": 0.01524046, + "auxiliary_loss_mlp": 0.01356818, + "balance_loss_clip": 1.16914463, + "balance_loss_mlp": 1.06823659, + "epoch": 0.31611855949017015, + "flos": 25595846595360.0, + "grad_norm": 2.062336026898178, + "language_loss": 0.66515046, + "learning_rate": 3.2011332084174398e-06, + "loss": 0.69395912, + "num_input_tokens_seen": 56607585, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 2.890625, + "step": 2629, + "time_per_iteration": 2.9357142448425293 + }, + { + "auxiliary_loss_clip": 0.01520817, + "auxiliary_loss_mlp": 0.01347188, + "balance_loss_clip": 1.16662216, + "balance_loss_mlp": 1.05765271, + "epoch": 0.31623880238080926, + "flos": 20596144560480.0, + "grad_norm": 1.695867500571106, + "language_loss": 0.8906467, + "learning_rate": 3.2005102712806756e-06, + "loss": 0.91932678, + "num_input_tokens_seen": 56626415, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 2.89648438, + "step": 2630, + "time_per_iteration": 3.874530076980591 + }, + { + "auxiliary_loss_clip": 0.015248, + "auxiliary_loss_mlp": 0.01342075, + "balance_loss_clip": 1.17103982, + "balance_loss_mlp": 1.05005956, + "epoch": 0.3163590452714483, + "flos": 12786468811200.0, + "grad_norm": 2.5182474810380713, + "language_loss": 0.72785944, + "learning_rate": 3.1998871520274575e-06, + "loss": 0.75652814, + "num_input_tokens_seen": 56641750, + "router_z_loss_clip": 3.54101562, + "router_z_loss_mlp": 2.91992188, + "step": 2631, + "time_per_iteration": 2.971750259399414 + }, + { + "auxiliary_loss_clip": 0.01521643, + "auxiliary_loss_mlp": 0.0132912, + "balance_loss_clip": 1.167395, + "balance_loss_mlp": 1.03882146, + "epoch": 0.3164792881620874, + "flos": 23043635206560.0, + "grad_norm": 2.0064729792165483, + "language_loss": 0.85013008, + "learning_rate": 3.199263850752312e-06, + "loss": 0.87863773, + "num_input_tokens_seen": 56662585, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 2.90429688, + "step": 2632, + "time_per_iteration": 2.9680593013763428 + }, + { + "auxiliary_loss_clip": 0.01522206, + "auxiliary_loss_mlp": 0.01342366, + "balance_loss_clip": 1.16863084, + "balance_loss_mlp": 1.05340314, + "epoch": 0.31659953105272653, + "flos": 18298244034720.0, + "grad_norm": 2.346408370702437, + "language_loss": 0.85822976, + "learning_rate": 3.198640367549795e-06, + "loss": 0.88687551, + "num_input_tokens_seen": 56681480, + "router_z_loss_clip": 3.54101562, + "router_z_loss_mlp": 2.89257812, + "step": 2633, + "time_per_iteration": 3.0819196701049805 + }, + { + "auxiliary_loss_clip": 0.01517413, + "auxiliary_loss_mlp": 0.01345006, + "balance_loss_clip": 1.16297603, + "balance_loss_mlp": 1.05775917, + "epoch": 0.3167197739433656, + "flos": 25705839352320.0, + "grad_norm": 1.6887021254891685, + "language_loss": 0.85835934, + "learning_rate": 3.198016702514487e-06, + "loss": 0.88698351, + "num_input_tokens_seen": 56701760, + "router_z_loss_clip": 3.54101562, + "router_z_loss_mlp": 2.87695312, + "step": 2634, + "time_per_iteration": 2.9698169231414795 + }, + { + "auxiliary_loss_clip": 0.01530668, + "auxiliary_loss_mlp": 0.01333434, + "balance_loss_clip": 1.17751551, + "balance_loss_mlp": 1.04447103, + "epoch": 0.3168400168340047, + "flos": 23548046906880.0, + "grad_norm": 1.8211835308424922, + "language_loss": 0.84297788, + "learning_rate": 3.1973928557409972e-06, + "loss": 0.87161893, + "num_input_tokens_seen": 56719800, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 2.890625, + "step": 2635, + "time_per_iteration": 3.0568530559539795 + }, + { + "auxiliary_loss_clip": 0.01525198, + "auxiliary_loss_mlp": 0.0133338, + "balance_loss_clip": 1.17115831, + "balance_loss_mlp": 1.04422641, + "epoch": 0.31696025972464376, + "flos": 28368991702080.0, + "grad_norm": 1.9535399203316814, + "language_loss": 0.71239161, + "learning_rate": 3.1967688273239636e-06, + "loss": 0.74097729, + "num_input_tokens_seen": 56739605, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 2.89453125, + "step": 2636, + "time_per_iteration": 3.0328593254089355 + }, + { + "auxiliary_loss_clip": 0.01533797, + "auxiliary_loss_mlp": 0.0132795, + "balance_loss_clip": 1.18098569, + "balance_loss_mlp": 1.04013097, + "epoch": 0.31708050261528287, + "flos": 16400982670560.0, + "grad_norm": 1.7697935871201513, + "language_loss": 0.82223594, + "learning_rate": 3.1961446173580503e-06, + "loss": 0.85085344, + "num_input_tokens_seen": 56756545, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 2.8828125, + "step": 2637, + "time_per_iteration": 2.962622880935669 + }, + { + "auxiliary_loss_clip": 0.01529131, + "auxiliary_loss_mlp": 0.01328902, + "balance_loss_clip": 1.1736089, + "balance_loss_mlp": 1.03993881, + "epoch": 0.317200745505922, + "flos": 26214688647360.0, + "grad_norm": 1.9234794765988543, + "language_loss": 0.77247214, + "learning_rate": 3.1955202259379502e-06, + "loss": 0.80105257, + "num_input_tokens_seen": 56778275, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.89648438, + "step": 2638, + "time_per_iteration": 3.084254026412964 + }, + { + "auxiliary_loss_clip": 0.01524442, + "auxiliary_loss_mlp": 0.01322464, + "balance_loss_clip": 1.16969788, + "balance_loss_mlp": 1.0294956, + "epoch": 0.31732098839656103, + "flos": 31353171276960.0, + "grad_norm": 2.2270160577857294, + "language_loss": 0.83261037, + "learning_rate": 3.194895653158381e-06, + "loss": 0.86107945, + "num_input_tokens_seen": 56797215, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.9296875, + "step": 2639, + "time_per_iteration": 3.0629830360412598 + }, + { + "auxiliary_loss_clip": 0.01513989, + "auxiliary_loss_mlp": 0.01246719, + "balance_loss_clip": 1.16218162, + "balance_loss_mlp": 1.00944519, + "epoch": 0.31744123128720014, + "flos": 58995944848800.0, + "grad_norm": 0.7755021968201817, + "language_loss": 0.5550158, + "learning_rate": 3.194270899114093e-06, + "loss": 0.58262289, + "num_input_tokens_seen": 56863010, + "router_z_loss_clip": 3.5234375, + "router_z_loss_mlp": 2.3671875, + "step": 2640, + "time_per_iteration": 3.6296634674072266 + }, + { + "auxiliary_loss_clip": 0.01525181, + "auxiliary_loss_mlp": 0.01323824, + "balance_loss_clip": 1.17159867, + "balance_loss_mlp": 1.03219008, + "epoch": 0.31756147417783925, + "flos": 17419326039360.0, + "grad_norm": 2.953444648451574, + "language_loss": 0.82090348, + "learning_rate": 3.193645963899858e-06, + "loss": 0.84939349, + "num_input_tokens_seen": 56880625, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 2.91796875, + "step": 2641, + "time_per_iteration": 3.109017848968506 + }, + { + "auxiliary_loss_clip": 0.01530457, + "auxiliary_loss_mlp": 0.0132335, + "balance_loss_clip": 1.17765605, + "balance_loss_mlp": 1.02999961, + "epoch": 0.3176817170684783, + "flos": 25483919502240.0, + "grad_norm": 1.816413775668648, + "language_loss": 0.84067947, + "learning_rate": 3.193020847610479e-06, + "loss": 0.86921751, + "num_input_tokens_seen": 56900945, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 2.93359375, + "step": 2642, + "time_per_iteration": 3.0463764667510986 + }, + { + "auxiliary_loss_clip": 0.01525165, + "auxiliary_loss_mlp": 0.01331142, + "balance_loss_clip": 1.172194, + "balance_loss_mlp": 1.03645682, + "epoch": 0.3178019599591174, + "flos": 24974918494560.0, + "grad_norm": 2.6206488324477193, + "language_loss": 0.71677357, + "learning_rate": 3.192395550340787e-06, + "loss": 0.74533665, + "num_input_tokens_seen": 56918895, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 2.9453125, + "step": 2643, + "time_per_iteration": 3.018808603286743 + }, + { + "auxiliary_loss_clip": 0.01522776, + "auxiliary_loss_mlp": 0.01331445, + "balance_loss_clip": 1.16990113, + "balance_loss_mlp": 1.04133725, + "epoch": 0.31792220284975653, + "flos": 12423871958400.0, + "grad_norm": 2.2797576677205647, + "language_loss": 0.76870495, + "learning_rate": 3.191770072185638e-06, + "loss": 0.79724717, + "num_input_tokens_seen": 56935890, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 2.90429688, + "step": 2644, + "time_per_iteration": 2.9780113697052 + }, + { + "auxiliary_loss_clip": 0.01524885, + "auxiliary_loss_mlp": 0.01326624, + "balance_loss_clip": 1.17159438, + "balance_loss_mlp": 1.03403628, + "epoch": 0.3180424457403956, + "flos": 15487511757120.0, + "grad_norm": 3.933418612375881, + "language_loss": 0.7207045, + "learning_rate": 3.191144413239916e-06, + "loss": 0.7492196, + "num_input_tokens_seen": 56952460, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 2.92578125, + "step": 2645, + "time_per_iteration": 2.9004809856414795 + }, + { + "auxiliary_loss_clip": 0.01527239, + "auxiliary_loss_mlp": 0.01330045, + "balance_loss_clip": 1.1731168, + "balance_loss_mlp": 1.03345263, + "epoch": 0.3181626886310347, + "flos": 26177253189120.0, + "grad_norm": 4.10693115294756, + "language_loss": 0.88515097, + "learning_rate": 3.190518573598534e-06, + "loss": 0.91372383, + "num_input_tokens_seen": 56969065, + "router_z_loss_clip": 3.54101562, + "router_z_loss_mlp": 2.96484375, + "step": 2646, + "time_per_iteration": 3.0138778686523438 + }, + { + "auxiliary_loss_clip": 0.01522967, + "auxiliary_loss_mlp": 0.01330452, + "balance_loss_clip": 1.16843843, + "balance_loss_mlp": 1.03805542, + "epoch": 0.3182829315216738, + "flos": 25485436628640.0, + "grad_norm": 1.4995551427173257, + "language_loss": 0.77765745, + "learning_rate": 3.1898925533564308e-06, + "loss": 0.80619168, + "num_input_tokens_seen": 56990535, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.92578125, + "step": 2647, + "time_per_iteration": 2.9505972862243652 + }, + { + "auxiliary_loss_clip": 0.01521963, + "auxiliary_loss_mlp": 0.01339683, + "balance_loss_clip": 1.16844463, + "balance_loss_mlp": 1.0469054, + "epoch": 0.31840317441231286, + "flos": 18115656050880.0, + "grad_norm": 2.111801682441309, + "language_loss": 0.64362794, + "learning_rate": 3.1892663526085733e-06, + "loss": 0.67224443, + "num_input_tokens_seen": 57008910, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 2.92773438, + "step": 2648, + "time_per_iteration": 3.0180745124816895 + }, + { + "auxiliary_loss_clip": 0.01511698, + "auxiliary_loss_mlp": 0.01258484, + "balance_loss_clip": 1.1588738, + "balance_loss_mlp": 1.02502441, + "epoch": 0.31852341730295197, + "flos": 64748528510400.0, + "grad_norm": 0.7690081697593337, + "language_loss": 0.5687924, + "learning_rate": 3.188639971449956e-06, + "loss": 0.59649414, + "num_input_tokens_seen": 57074960, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 2.328125, + "step": 2649, + "time_per_iteration": 3.416991710662842 + }, + { + "auxiliary_loss_clip": 0.01524723, + "auxiliary_loss_mlp": 0.01348566, + "balance_loss_clip": 1.16950798, + "balance_loss_mlp": 1.04949415, + "epoch": 0.318643660193591, + "flos": 20670256913760.0, + "grad_norm": 2.847142906923379, + "language_loss": 0.72468805, + "learning_rate": 3.1880134099756e-06, + "loss": 0.75342095, + "num_input_tokens_seen": 57094595, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 2.98632812, + "step": 2650, + "time_per_iteration": 3.010758399963379 + }, + { + "auxiliary_loss_clip": 0.0152138, + "auxiliary_loss_mlp": 0.01343658, + "balance_loss_clip": 1.16561043, + "balance_loss_mlp": 1.04992664, + "epoch": 0.31876390308423014, + "flos": 26945799145920.0, + "grad_norm": 1.9332863630649237, + "language_loss": 0.69880342, + "learning_rate": 3.1873866682805535e-06, + "loss": 0.72745377, + "num_input_tokens_seen": 57115290, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.9375, + "step": 2651, + "time_per_iteration": 3.072295665740967 + }, + { + "auxiliary_loss_clip": 0.01524363, + "auxiliary_loss_mlp": 0.01336636, + "balance_loss_clip": 1.16935825, + "balance_loss_mlp": 1.0421412, + "epoch": 0.31888414597486925, + "flos": 18043933171680.0, + "grad_norm": 1.8478550347875176, + "language_loss": 0.8869018, + "learning_rate": 3.186759746459894e-06, + "loss": 0.91551179, + "num_input_tokens_seen": 57134400, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.9453125, + "step": 2652, + "time_per_iteration": 4.714097738265991 + }, + { + "auxiliary_loss_clip": 0.0152321, + "auxiliary_loss_mlp": 0.01337489, + "balance_loss_clip": 1.16874313, + "balance_loss_mlp": 1.04261279, + "epoch": 0.3190043888655083, + "flos": 25151324188320.0, + "grad_norm": 2.0648656158694534, + "language_loss": 0.79719442, + "learning_rate": 3.1861326446087246e-06, + "loss": 0.82580137, + "num_input_tokens_seen": 57153140, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.9453125, + "step": 2653, + "time_per_iteration": 3.0544564723968506 + }, + { + "auxiliary_loss_clip": 0.01520709, + "auxiliary_loss_mlp": 0.01318073, + "balance_loss_clip": 1.16494596, + "balance_loss_mlp": 1.02758384, + "epoch": 0.3191246317561474, + "flos": 22056051939840.0, + "grad_norm": 2.4685943880809598, + "language_loss": 0.72060567, + "learning_rate": 3.1855053628221763e-06, + "loss": 0.74899352, + "num_input_tokens_seen": 57172395, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.90820312, + "step": 2654, + "time_per_iteration": 3.14713978767395 + }, + { + "auxiliary_loss_clip": 0.01526281, + "auxiliary_loss_mlp": 0.01327404, + "balance_loss_clip": 1.17044222, + "balance_loss_mlp": 1.03557932, + "epoch": 0.3192448746467865, + "flos": 14903526048480.0, + "grad_norm": 2.8894184759518566, + "language_loss": 0.89833999, + "learning_rate": 3.184877901195407e-06, + "loss": 0.92687684, + "num_input_tokens_seen": 57189090, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.91992188, + "step": 2655, + "time_per_iteration": 3.8782079219818115 + }, + { + "auxiliary_loss_clip": 0.01517713, + "auxiliary_loss_mlp": 0.01235237, + "balance_loss_clip": 1.16119468, + "balance_loss_mlp": 1.00254059, + "epoch": 0.3193651175374256, + "flos": 67242216019680.0, + "grad_norm": 0.8060073494112184, + "language_loss": 0.62834769, + "learning_rate": 3.184250259823602e-06, + "loss": 0.65587723, + "num_input_tokens_seen": 57251620, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.3203125, + "step": 2656, + "time_per_iteration": 4.37033224105835 + }, + { + "auxiliary_loss_clip": 0.01521468, + "auxiliary_loss_mlp": 0.01329635, + "balance_loss_clip": 1.16582155, + "balance_loss_mlp": 1.03723872, + "epoch": 0.3194853604280647, + "flos": 12234343121280.0, + "grad_norm": 2.5357622267192848, + "language_loss": 0.81858593, + "learning_rate": 3.183622438801974e-06, + "loss": 0.84709692, + "num_input_tokens_seen": 57266910, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.92382812, + "step": 2657, + "time_per_iteration": 3.0285439491271973 + }, + { + "auxiliary_loss_clip": 0.01527406, + "auxiliary_loss_mlp": 0.01323394, + "balance_loss_clip": 1.17201948, + "balance_loss_mlp": 1.03347671, + "epoch": 0.3196056033187038, + "flos": 14941720069920.0, + "grad_norm": 5.899733032693576, + "language_loss": 0.749488, + "learning_rate": 3.1829944382257637e-06, + "loss": 0.77799594, + "num_input_tokens_seen": 57285040, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.90234375, + "step": 2658, + "time_per_iteration": 3.046788215637207 + }, + { + "auxiliary_loss_clip": 0.015213, + "auxiliary_loss_mlp": 0.01327612, + "balance_loss_clip": 1.16598415, + "balance_loss_mlp": 1.03712273, + "epoch": 0.31972584620934286, + "flos": 23771142529920.0, + "grad_norm": 2.36688426111828, + "language_loss": 0.8196032, + "learning_rate": 3.1823662581902373e-06, + "loss": 0.84809232, + "num_input_tokens_seen": 57302725, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.90625, + "step": 2659, + "time_per_iteration": 3.100625991821289 + }, + { + "auxiliary_loss_clip": 0.01521682, + "auxiliary_loss_mlp": 0.01323137, + "balance_loss_clip": 1.16668177, + "balance_loss_mlp": 1.03588986, + "epoch": 0.31984608909998197, + "flos": 21253673700000.0, + "grad_norm": 2.257681133460518, + "language_loss": 0.74481106, + "learning_rate": 3.1817378987906896e-06, + "loss": 0.77325928, + "num_input_tokens_seen": 57322230, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.87890625, + "step": 2660, + "time_per_iteration": 3.082346200942993 + }, + { + "auxiliary_loss_clip": 0.01526242, + "auxiliary_loss_mlp": 0.01318073, + "balance_loss_clip": 1.17164969, + "balance_loss_mlp": 1.02739334, + "epoch": 0.3199663319906211, + "flos": 18298206106560.0, + "grad_norm": 22.68382687745805, + "language_loss": 0.80133295, + "learning_rate": 3.181109360122442e-06, + "loss": 0.82977605, + "num_input_tokens_seen": 57339820, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.90820312, + "step": 2661, + "time_per_iteration": 3.0232722759246826 + }, + { + "auxiliary_loss_clip": 0.0152, + "auxiliary_loss_mlp": 0.01328361, + "balance_loss_clip": 1.1636579, + "balance_loss_mlp": 1.03768134, + "epoch": 0.32008657488126013, + "flos": 18735598019520.0, + "grad_norm": 2.83547991984373, + "language_loss": 0.78625691, + "learning_rate": 3.1804806422808445e-06, + "loss": 0.81474048, + "num_input_tokens_seen": 57356955, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.90820312, + "step": 2662, + "time_per_iteration": 2.9405016899108887 + }, + { + "auxiliary_loss_clip": 0.01523992, + "auxiliary_loss_mlp": 0.01323473, + "balance_loss_clip": 1.16850305, + "balance_loss_mlp": 1.0314579, + "epoch": 0.32020681777189924, + "flos": 20597472046080.0, + "grad_norm": 2.6862526930254687, + "language_loss": 0.73037148, + "learning_rate": 3.1798517453612714e-06, + "loss": 0.75884616, + "num_input_tokens_seen": 57376760, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.921875, + "step": 2663, + "time_per_iteration": 2.9794018268585205 + }, + { + "auxiliary_loss_clip": 0.01527773, + "auxiliary_loss_mlp": 0.01320369, + "balance_loss_clip": 1.17164469, + "balance_loss_mlp": 1.03121448, + "epoch": 0.32032706066253835, + "flos": 35264589687360.0, + "grad_norm": 2.125897646107858, + "language_loss": 0.75086176, + "learning_rate": 3.1792226694591265e-06, + "loss": 0.77934313, + "num_input_tokens_seen": 57398145, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.89453125, + "step": 2664, + "time_per_iteration": 3.0797312259674072 + }, + { + "auxiliary_loss_clip": 0.01522903, + "auxiliary_loss_mlp": 0.01321783, + "balance_loss_clip": 1.16765547, + "balance_loss_mlp": 1.03301024, + "epoch": 0.3204473035531774, + "flos": 15306023689920.0, + "grad_norm": 2.087133804417857, + "language_loss": 0.80360854, + "learning_rate": 3.178593414669841e-06, + "loss": 0.83205545, + "num_input_tokens_seen": 57416730, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.89257812, + "step": 2665, + "time_per_iteration": 3.0726568698883057 + }, + { + "auxiliary_loss_clip": 0.01520472, + "auxiliary_loss_mlp": 0.01331935, + "balance_loss_clip": 1.1641618, + "balance_loss_mlp": 1.04049206, + "epoch": 0.3205675464438165, + "flos": 24464893426560.0, + "grad_norm": 3.318685613225918, + "language_loss": 0.70162034, + "learning_rate": 3.1779639810888707e-06, + "loss": 0.73014438, + "num_input_tokens_seen": 57436325, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.9140625, + "step": 2666, + "time_per_iteration": 3.1708359718322754 + }, + { + "auxiliary_loss_clip": 0.01521668, + "auxiliary_loss_mlp": 0.01330071, + "balance_loss_clip": 1.16613293, + "balance_loss_mlp": 1.03939104, + "epoch": 0.3206877893344556, + "flos": 22458473724960.0, + "grad_norm": 2.372691357227519, + "language_loss": 0.76222491, + "learning_rate": 3.1773343688117013e-06, + "loss": 0.79074222, + "num_input_tokens_seen": 57457235, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.90820312, + "step": 2667, + "time_per_iteration": 3.0240375995635986 + }, + { + "auxiliary_loss_clip": 0.01520508, + "auxiliary_loss_mlp": 0.01317271, + "balance_loss_clip": 1.16451144, + "balance_loss_mlp": 1.02716374, + "epoch": 0.3208080322250947, + "flos": 20414315139840.0, + "grad_norm": 2.0142275178529956, + "language_loss": 0.84130049, + "learning_rate": 3.1767045779338445e-06, + "loss": 0.86967826, + "num_input_tokens_seen": 57474895, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.90625, + "step": 2668, + "time_per_iteration": 2.9977705478668213 + }, + { + "auxiliary_loss_clip": 0.01516764, + "auxiliary_loss_mlp": 0.01316368, + "balance_loss_clip": 1.16020107, + "balance_loss_mlp": 1.02511644, + "epoch": 0.3209282751157338, + "flos": 21764267690400.0, + "grad_norm": 2.438499391112727, + "language_loss": 0.9133693, + "learning_rate": 3.176074608550839e-06, + "loss": 0.94170058, + "num_input_tokens_seen": 57490715, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.91210938, + "step": 2669, + "time_per_iteration": 3.038308620452881 + }, + { + "auxiliary_loss_clip": 0.01525631, + "auxiliary_loss_mlp": 0.01323737, + "balance_loss_clip": 1.17018294, + "balance_loss_mlp": 1.0343926, + "epoch": 0.32104851800637285, + "flos": 22057265640960.0, + "grad_norm": 2.4956446844957307, + "language_loss": 0.82969576, + "learning_rate": 3.17544446075825e-06, + "loss": 0.85818946, + "num_input_tokens_seen": 57509880, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.89648438, + "step": 2670, + "time_per_iteration": 2.978254556655884 + }, + { + "auxiliary_loss_clip": 0.01515458, + "auxiliary_loss_mlp": 0.01323638, + "balance_loss_clip": 1.1596067, + "balance_loss_mlp": 1.03486514, + "epoch": 0.32116876089701196, + "flos": 37015257255840.0, + "grad_norm": 1.6169547376403142, + "language_loss": 0.70804155, + "learning_rate": 3.174814134651671e-06, + "loss": 0.73643255, + "num_input_tokens_seen": 57532430, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.89453125, + "step": 2671, + "time_per_iteration": 3.153388738632202 + }, + { + "auxiliary_loss_clip": 0.01523471, + "auxiliary_loss_mlp": 0.01324055, + "balance_loss_clip": 1.16642189, + "balance_loss_mlp": 1.0379529, + "epoch": 0.3212890037876511, + "flos": 21981294807840.0, + "grad_norm": 2.008893878559021, + "language_loss": 0.8047784, + "learning_rate": 3.1741836303267215e-06, + "loss": 0.83325362, + "num_input_tokens_seen": 57551965, + "router_z_loss_clip": 3.57617188, + "router_z_loss_mlp": 2.8671875, + "step": 2672, + "time_per_iteration": 2.996609687805176 + }, + { + "auxiliary_loss_clip": 0.01521609, + "auxiliary_loss_mlp": 0.01321554, + "balance_loss_clip": 1.16366649, + "balance_loss_mlp": 1.03259087, + "epoch": 0.32140924667829013, + "flos": 10343870897760.0, + "grad_norm": 2.6187336120008173, + "language_loss": 0.75115073, + "learning_rate": 3.1735529478790496e-06, + "loss": 0.77958238, + "num_input_tokens_seen": 57569955, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 2.89257812, + "step": 2673, + "time_per_iteration": 3.0851218700408936 + }, + { + "auxiliary_loss_clip": 0.01520071, + "auxiliary_loss_mlp": 0.01328437, + "balance_loss_clip": 1.1635114, + "balance_loss_mlp": 1.0371846, + "epoch": 0.32152948956892924, + "flos": 50802356985120.0, + "grad_norm": 1.8641702010146763, + "language_loss": 0.79371005, + "learning_rate": 3.172922087404328e-06, + "loss": 0.82219517, + "num_input_tokens_seen": 57592215, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.91210938, + "step": 2674, + "time_per_iteration": 3.352250814437866 + }, + { + "auxiliary_loss_clip": 0.01517447, + "auxiliary_loss_mlp": 0.01241386, + "balance_loss_clip": 1.159904, + "balance_loss_mlp": 1.007164, + "epoch": 0.32164973245956835, + "flos": 63869610515040.0, + "grad_norm": 0.7878362121592234, + "language_loss": 0.55169237, + "learning_rate": 3.1722910489982586e-06, + "loss": 0.57928073, + "num_input_tokens_seen": 57652575, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 2.3359375, + "step": 2675, + "time_per_iteration": 3.6043031215667725 + }, + { + "auxiliary_loss_clip": 0.01525085, + "auxiliary_loss_mlp": 0.013261, + "balance_loss_clip": 1.16707706, + "balance_loss_mlp": 1.03446627, + "epoch": 0.3217699753502074, + "flos": 23516224816320.0, + "grad_norm": 1.6354385672218692, + "language_loss": 0.79892832, + "learning_rate": 3.1716598327565694e-06, + "loss": 0.82744014, + "num_input_tokens_seen": 57672215, + "router_z_loss_clip": 3.5859375, + "router_z_loss_mlp": 2.91796875, + "step": 2676, + "time_per_iteration": 3.0321967601776123 + }, + { + "auxiliary_loss_clip": 0.01514788, + "auxiliary_loss_mlp": 0.01309129, + "balance_loss_clip": 1.15723956, + "balance_loss_mlp": 1.02283597, + "epoch": 0.3218902182408465, + "flos": 19064552230080.0, + "grad_norm": 1.9583015309144252, + "language_loss": 0.84196603, + "learning_rate": 3.171028438775015e-06, + "loss": 0.87020522, + "num_input_tokens_seen": 57691410, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 2.86914062, + "step": 2677, + "time_per_iteration": 3.102994680404663 + }, + { + "auxiliary_loss_clip": 0.0151416, + "auxiliary_loss_mlp": 0.01307491, + "balance_loss_clip": 1.15685344, + "balance_loss_mlp": 1.02043533, + "epoch": 0.3220104611314856, + "flos": 20377638244800.0, + "grad_norm": 2.4361555476947285, + "language_loss": 0.84326243, + "learning_rate": 3.170396867149377e-06, + "loss": 0.87147903, + "num_input_tokens_seen": 57709415, + "router_z_loss_clip": 3.57617188, + "router_z_loss_mlp": 2.87304688, + "step": 2678, + "time_per_iteration": 3.1009740829467773 + }, + { + "auxiliary_loss_clip": 0.01523683, + "auxiliary_loss_mlp": 0.0132941, + "balance_loss_clip": 1.16654611, + "balance_loss_mlp": 1.03911185, + "epoch": 0.3221307040221247, + "flos": 20118889787040.0, + "grad_norm": 1.7720392017741917, + "language_loss": 0.86804456, + "learning_rate": 3.1697651179754653e-06, + "loss": 0.89657545, + "num_input_tokens_seen": 57728075, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 2.90429688, + "step": 2679, + "time_per_iteration": 4.006077527999878 + }, + { + "auxiliary_loss_clip": 0.0152762, + "auxiliary_loss_mlp": 0.01318695, + "balance_loss_clip": 1.17154849, + "balance_loss_mlp": 1.03068542, + "epoch": 0.3222509469127638, + "flos": 23990066055360.0, + "grad_norm": 1.8399071495124577, + "language_loss": 0.73150814, + "learning_rate": 3.1691331913491153e-06, + "loss": 0.75997132, + "num_input_tokens_seen": 57750645, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.88476562, + "step": 2680, + "time_per_iteration": 3.1420936584472656 + }, + { + "auxiliary_loss_clip": 0.01522385, + "auxiliary_loss_mlp": 0.01316399, + "balance_loss_clip": 1.16509438, + "balance_loss_mlp": 1.0281986, + "epoch": 0.32237118980340285, + "flos": 17677657287360.0, + "grad_norm": 2.1426891295363752, + "language_loss": 0.84912461, + "learning_rate": 3.1685010873661898e-06, + "loss": 0.87751245, + "num_input_tokens_seen": 57769820, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.88671875, + "step": 2681, + "time_per_iteration": 2.9479641914367676 + }, + { + "auxiliary_loss_clip": 0.0152505, + "auxiliary_loss_mlp": 0.01322049, + "balance_loss_clip": 1.16911018, + "balance_loss_mlp": 1.03251314, + "epoch": 0.32249143269404196, + "flos": 23150176500960.0, + "grad_norm": 2.448067720514711, + "language_loss": 0.79721236, + "learning_rate": 3.167868806122578e-06, + "loss": 0.8256833, + "num_input_tokens_seen": 57788870, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.8984375, + "step": 2682, + "time_per_iteration": 3.0024361610412598 + }, + { + "auxiliary_loss_clip": 0.01526204, + "auxiliary_loss_mlp": 0.0132228, + "balance_loss_clip": 1.17033315, + "balance_loss_mlp": 1.03522384, + "epoch": 0.32261167558468107, + "flos": 24424423715520.0, + "grad_norm": 1.836368872183076, + "language_loss": 0.66422194, + "learning_rate": 3.1672363477141968e-06, + "loss": 0.69270682, + "num_input_tokens_seen": 57808165, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.875, + "step": 2683, + "time_per_iteration": 4.712559223175049 + }, + { + "auxiliary_loss_clip": 0.01517739, + "auxiliary_loss_mlp": 0.01322724, + "balance_loss_clip": 1.16116011, + "balance_loss_mlp": 1.03280759, + "epoch": 0.3227319184753201, + "flos": 30369494610720.0, + "grad_norm": 2.1714396210992386, + "language_loss": 0.84954417, + "learning_rate": 3.1666037122369903e-06, + "loss": 0.87794876, + "num_input_tokens_seen": 57828825, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.90039062, + "step": 2684, + "time_per_iteration": 3.0335280895233154 + }, + { + "auxiliary_loss_clip": 0.01518032, + "auxiliary_loss_mlp": 0.01330312, + "balance_loss_clip": 1.16043961, + "balance_loss_mlp": 1.03944159, + "epoch": 0.32285216136595923, + "flos": 16948063915200.0, + "grad_norm": 2.6846836145731054, + "language_loss": 0.86901796, + "learning_rate": 3.165970899786928e-06, + "loss": 0.89750147, + "num_input_tokens_seen": 57846740, + "router_z_loss_clip": 3.57617188, + "router_z_loss_mlp": 2.91015625, + "step": 2685, + "time_per_iteration": 3.0406715869903564 + }, + { + "auxiliary_loss_clip": 0.01520047, + "auxiliary_loss_mlp": 0.01323757, + "balance_loss_clip": 1.16310453, + "balance_loss_mlp": 1.03269553, + "epoch": 0.32297240425659834, + "flos": 21983722210080.0, + "grad_norm": 2.007613122279743, + "language_loss": 0.75712258, + "learning_rate": 3.1653379104600067e-06, + "loss": 0.78556061, + "num_input_tokens_seen": 57866885, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 2.91210938, + "step": 2686, + "time_per_iteration": 2.9981656074523926 + }, + { + "auxiliary_loss_clip": 0.01518246, + "auxiliary_loss_mlp": 0.01335005, + "balance_loss_clip": 1.16200292, + "balance_loss_mlp": 1.04203629, + "epoch": 0.3230926471472374, + "flos": 22750295902560.0, + "grad_norm": 1.8203561641495394, + "language_loss": 0.69435287, + "learning_rate": 3.164704744352251e-06, + "loss": 0.72288537, + "num_input_tokens_seen": 57887690, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.92773438, + "step": 2687, + "time_per_iteration": 3.0148916244506836 + }, + { + "auxiliary_loss_clip": 0.01514295, + "auxiliary_loss_mlp": 0.01322097, + "balance_loss_clip": 1.15824819, + "balance_loss_mlp": 1.03122675, + "epoch": 0.3232128900378765, + "flos": 16944688308960.0, + "grad_norm": 1.7459159288943777, + "language_loss": 0.80858159, + "learning_rate": 3.164071401559713e-06, + "loss": 0.83694553, + "num_input_tokens_seen": 57905090, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.90820312, + "step": 2688, + "time_per_iteration": 3.1835708618164062 + }, + { + "auxiliary_loss_clip": 0.0151652, + "auxiliary_loss_mlp": 0.01323535, + "balance_loss_clip": 1.15920281, + "balance_loss_mlp": 1.0338093, + "epoch": 0.3233331329285156, + "flos": 24025984387200.0, + "grad_norm": 1.945350143104069, + "language_loss": 0.71134967, + "learning_rate": 3.1634378821784674e-06, + "loss": 0.73975021, + "num_input_tokens_seen": 57925305, + "router_z_loss_clip": 3.57617188, + "router_z_loss_mlp": 2.8984375, + "step": 2689, + "time_per_iteration": 3.046875 + }, + { + "auxiliary_loss_clip": 0.01518295, + "auxiliary_loss_mlp": 0.01327603, + "balance_loss_clip": 1.16062677, + "balance_loss_mlp": 1.03749526, + "epoch": 0.3234533758191547, + "flos": 18115807763520.0, + "grad_norm": 2.9387555827665635, + "language_loss": 0.73605424, + "learning_rate": 3.1628041863046208e-06, + "loss": 0.76451325, + "num_input_tokens_seen": 57942720, + "router_z_loss_clip": 3.58007812, + "router_z_loss_mlp": 2.90234375, + "step": 2690, + "time_per_iteration": 3.1478497982025146 + }, + { + "auxiliary_loss_clip": 0.01512063, + "auxiliary_loss_mlp": 0.01327325, + "balance_loss_clip": 1.15516996, + "balance_loss_mlp": 1.03473747, + "epoch": 0.3235736187097938, + "flos": 16948025987040.0, + "grad_norm": 2.3531912604564202, + "language_loss": 0.91247511, + "learning_rate": 3.162170314034304e-06, + "loss": 0.94086897, + "num_input_tokens_seen": 57960135, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.92578125, + "step": 2691, + "time_per_iteration": 3.179025411605835 + }, + { + "auxiliary_loss_clip": 0.01516446, + "auxiliary_loss_mlp": 0.01317613, + "balance_loss_clip": 1.15913975, + "balance_loss_mlp": 1.02597964, + "epoch": 0.3236938616004329, + "flos": 22129329873600.0, + "grad_norm": 1.7892135139738927, + "language_loss": 0.81200516, + "learning_rate": 3.1615362654636738e-06, + "loss": 0.84034574, + "num_input_tokens_seen": 57980875, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.91796875, + "step": 2692, + "time_per_iteration": 3.1013731956481934 + }, + { + "auxiliary_loss_clip": 0.01520064, + "auxiliary_loss_mlp": 0.01323095, + "balance_loss_clip": 1.16278255, + "balance_loss_mlp": 1.03737378, + "epoch": 0.32381410449107195, + "flos": 17166494374560.0, + "grad_norm": 1.850946087283365, + "language_loss": 0.87199831, + "learning_rate": 3.1609020406889163e-06, + "loss": 0.90042996, + "num_input_tokens_seen": 57998310, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.86132812, + "step": 2693, + "time_per_iteration": 3.0692625045776367 + }, + { + "auxiliary_loss_clip": 0.01511678, + "auxiliary_loss_mlp": 0.01330852, + "balance_loss_clip": 1.15351534, + "balance_loss_mlp": 1.03864598, + "epoch": 0.32393434738171106, + "flos": 16579815766560.0, + "grad_norm": 2.141335453939939, + "language_loss": 0.85085249, + "learning_rate": 3.1602676398062416e-06, + "loss": 0.87927783, + "num_input_tokens_seen": 58017220, + "router_z_loss_clip": 3.58203125, + "router_z_loss_mlp": 2.921875, + "step": 2694, + "time_per_iteration": 3.0349035263061523 + }, + { + "auxiliary_loss_clip": 0.0150657, + "auxiliary_loss_mlp": 0.01321109, + "balance_loss_clip": 1.1479342, + "balance_loss_mlp": 1.03100097, + "epoch": 0.3240545902723502, + "flos": 25485967622880.0, + "grad_norm": 2.6766624340280964, + "language_loss": 0.61669248, + "learning_rate": 3.1596330629118886e-06, + "loss": 0.64496928, + "num_input_tokens_seen": 58037190, + "router_z_loss_clip": 3.5859375, + "router_z_loss_mlp": 2.90234375, + "step": 2695, + "time_per_iteration": 3.0943212509155273 + }, + { + "auxiliary_loss_clip": 0.01510778, + "auxiliary_loss_mlp": 0.01332646, + "balance_loss_clip": 1.15263116, + "balance_loss_mlp": 1.04253805, + "epoch": 0.32417483316298923, + "flos": 35848802964960.0, + "grad_norm": 2.043820541615477, + "language_loss": 0.72891814, + "learning_rate": 3.1589983101021223e-06, + "loss": 0.75735235, + "num_input_tokens_seen": 58055820, + "router_z_loss_clip": 3.58203125, + "router_z_loss_mlp": 2.90234375, + "step": 2696, + "time_per_iteration": 3.1342482566833496 + }, + { + "auxiliary_loss_clip": 0.01506825, + "auxiliary_loss_mlp": 0.01318576, + "balance_loss_clip": 1.14834785, + "balance_loss_mlp": 1.02827752, + "epoch": 0.32429507605362834, + "flos": 30083096160000.0, + "grad_norm": 2.0457889444456088, + "language_loss": 0.84913307, + "learning_rate": 3.1583633814732337e-06, + "loss": 0.87738705, + "num_input_tokens_seen": 58075340, + "router_z_loss_clip": 3.59179688, + "router_z_loss_mlp": 2.90234375, + "step": 2697, + "time_per_iteration": 3.112316370010376 + }, + { + "auxiliary_loss_clip": 0.01506684, + "auxiliary_loss_mlp": 0.01309042, + "balance_loss_clip": 1.14782715, + "balance_loss_mlp": 1.02122355, + "epoch": 0.3244153189442674, + "flos": 18225231598080.0, + "grad_norm": 2.718288132123582, + "language_loss": 0.71734524, + "learning_rate": 3.157728277121541e-06, + "loss": 0.74550259, + "num_input_tokens_seen": 58093515, + "router_z_loss_clip": 3.58398438, + "router_z_loss_mlp": 2.88085938, + "step": 2698, + "time_per_iteration": 3.169877767562866 + }, + { + "auxiliary_loss_clip": 0.01503771, + "auxiliary_loss_mlp": 0.01310746, + "balance_loss_clip": 1.14550614, + "balance_loss_mlp": 1.0210197, + "epoch": 0.3245355618349065, + "flos": 17712134349120.0, + "grad_norm": 3.3173362246408775, + "language_loss": 0.78316027, + "learning_rate": 3.1570929971433897e-06, + "loss": 0.8113054, + "num_input_tokens_seen": 58109300, + "router_z_loss_clip": 3.5859375, + "router_z_loss_mlp": 2.90039062, + "step": 2699, + "time_per_iteration": 3.052483320236206 + }, + { + "auxiliary_loss_clip": 0.01507952, + "auxiliary_loss_mlp": 0.0131864, + "balance_loss_clip": 1.14985585, + "balance_loss_mlp": 1.02948618, + "epoch": 0.3246558047255456, + "flos": 23443060667040.0, + "grad_norm": 2.1303236823704257, + "language_loss": 0.84029567, + "learning_rate": 3.1564575416351504e-06, + "loss": 0.86856163, + "num_input_tokens_seen": 58128000, + "router_z_loss_clip": 3.5859375, + "router_z_loss_mlp": 2.89257812, + "step": 2700, + "time_per_iteration": 3.067854166030884 + }, + { + "auxiliary_loss_clip": 0.01505511, + "auxiliary_loss_mlp": 0.01326318, + "balance_loss_clip": 1.1466341, + "balance_loss_mlp": 1.03907084, + "epoch": 0.32477604761618467, + "flos": 21762940204800.0, + "grad_norm": 3.1552585900202454, + "language_loss": 0.74142814, + "learning_rate": 3.155821910693221e-06, + "loss": 0.76974642, + "num_input_tokens_seen": 58147415, + "router_z_loss_clip": 3.58984375, + "router_z_loss_mlp": 2.87695312, + "step": 2701, + "time_per_iteration": 3.0194733142852783 + }, + { + "auxiliary_loss_clip": 0.01508839, + "auxiliary_loss_mlp": 0.01317495, + "balance_loss_clip": 1.15102243, + "balance_loss_mlp": 1.0289135, + "epoch": 0.3248962905068238, + "flos": 19830367359360.0, + "grad_norm": 1.7272692606426838, + "language_loss": 0.86226112, + "learning_rate": 3.1551861044140275e-06, + "loss": 0.89052451, + "num_input_tokens_seen": 58167050, + "router_z_loss_clip": 3.58007812, + "router_z_loss_mlp": 2.88867188, + "step": 2702, + "time_per_iteration": 2.9792141914367676 + }, + { + "auxiliary_loss_clip": 0.01516916, + "auxiliary_loss_mlp": 0.01322242, + "balance_loss_clip": 1.15814424, + "balance_loss_mlp": 1.03289747, + "epoch": 0.3250165333974629, + "flos": 23950316979360.0, + "grad_norm": 1.7972517453722474, + "language_loss": 0.77562279, + "learning_rate": 3.15455012289402e-06, + "loss": 0.80401438, + "num_input_tokens_seen": 58186695, + "router_z_loss_clip": 3.58984375, + "router_z_loss_mlp": 2.89648438, + "step": 2703, + "time_per_iteration": 2.9877192974090576 + }, + { + "auxiliary_loss_clip": 0.0151408, + "auxiliary_loss_mlp": 0.01312531, + "balance_loss_clip": 1.15600395, + "balance_loss_mlp": 1.02394915, + "epoch": 0.32513677628810195, + "flos": 23991696966240.0, + "grad_norm": 1.8617058418630652, + "language_loss": 0.84283829, + "learning_rate": 3.153913966229677e-06, + "loss": 0.87110442, + "num_input_tokens_seen": 58205815, + "router_z_loss_clip": 3.58007812, + "router_z_loss_mlp": 2.88867188, + "step": 2704, + "time_per_iteration": 2.988387107849121 + }, + { + "auxiliary_loss_clip": 0.01495131, + "auxiliary_loss_mlp": 0.01228745, + "balance_loss_clip": 1.13632321, + "balance_loss_mlp": 0.99452209, + "epoch": 0.32525701917874106, + "flos": 70662877231680.0, + "grad_norm": 0.6510809284539728, + "language_loss": 0.50208294, + "learning_rate": 3.1532776345175027e-06, + "loss": 0.52932167, + "num_input_tokens_seen": 58270960, + "router_z_loss_clip": 3.59375, + "router_z_loss_mlp": 2.3359375, + "step": 2705, + "time_per_iteration": 3.411600351333618 + }, + { + "auxiliary_loss_clip": 0.01513816, + "auxiliary_loss_mlp": 0.01314356, + "balance_loss_clip": 1.15561938, + "balance_loss_mlp": 1.02539253, + "epoch": 0.32537726206938017, + "flos": 19684683839520.0, + "grad_norm": 1.7855423313402512, + "language_loss": 0.78825819, + "learning_rate": 3.1526411278540285e-06, + "loss": 0.81654, + "num_input_tokens_seen": 58289390, + "router_z_loss_clip": 3.58203125, + "router_z_loss_mlp": 2.89257812, + "step": 2706, + "time_per_iteration": 3.8822054862976074 + }, + { + "auxiliary_loss_clip": 0.01510198, + "auxiliary_loss_mlp": 0.01322574, + "balance_loss_clip": 1.15111673, + "balance_loss_mlp": 1.03017735, + "epoch": 0.3254975049600192, + "flos": 28763183076480.0, + "grad_norm": 2.4633036104032153, + "language_loss": 0.81061065, + "learning_rate": 3.1520044463358116e-06, + "loss": 0.83893836, + "num_input_tokens_seen": 58306120, + "router_z_loss_clip": 3.58789062, + "router_z_loss_mlp": 2.91992188, + "step": 2707, + "time_per_iteration": 3.968499183654785 + }, + { + "auxiliary_loss_clip": 0.01514936, + "auxiliary_loss_mlp": 0.0131732, + "balance_loss_clip": 1.1556406, + "balance_loss_mlp": 1.02816546, + "epoch": 0.32561774785065833, + "flos": 18879233418720.0, + "grad_norm": 3.6800689620001985, + "language_loss": 0.80262589, + "learning_rate": 3.151367590059436e-06, + "loss": 0.83094847, + "num_input_tokens_seen": 58324545, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 2.89257812, + "step": 2708, + "time_per_iteration": 3.007904052734375 + }, + { + "auxiliary_loss_clip": 0.01515474, + "auxiliary_loss_mlp": 0.01319027, + "balance_loss_clip": 1.15660429, + "balance_loss_mlp": 1.03025484, + "epoch": 0.32573799074129745, + "flos": 23114334025440.0, + "grad_norm": 2.2001344381054166, + "language_loss": 0.86847472, + "learning_rate": 3.1507305591215117e-06, + "loss": 0.89681971, + "num_input_tokens_seen": 58342455, + "router_z_loss_clip": 3.58984375, + "router_z_loss_mlp": 2.88867188, + "step": 2709, + "time_per_iteration": 3.029721736907959 + }, + { + "auxiliary_loss_clip": 0.01496433, + "auxiliary_loss_mlp": 0.01264732, + "balance_loss_clip": 1.13774586, + "balance_loss_mlp": 1.02974701, + "epoch": 0.3258582336319365, + "flos": 71244663107040.0, + "grad_norm": 0.6837815554740042, + "language_loss": 0.55688304, + "learning_rate": 3.150093353618677e-06, + "loss": 0.58449471, + "num_input_tokens_seen": 58407185, + "router_z_loss_clip": 3.59375, + "router_z_loss_mlp": 2.34375, + "step": 2710, + "time_per_iteration": 4.469970226287842 + }, + { + "auxiliary_loss_clip": 0.01514352, + "auxiliary_loss_mlp": 0.01319467, + "balance_loss_clip": 1.15599644, + "balance_loss_mlp": 1.03069425, + "epoch": 0.3259784765225756, + "flos": 22458246156000.0, + "grad_norm": 2.9850002413030245, + "language_loss": 0.88626897, + "learning_rate": 3.149455973647596e-06, + "loss": 0.91460717, + "num_input_tokens_seen": 58425245, + "router_z_loss_clip": 3.5859375, + "router_z_loss_mlp": 2.890625, + "step": 2711, + "time_per_iteration": 3.9403769969940186 + }, + { + "auxiliary_loss_clip": 0.01512486, + "auxiliary_loss_mlp": 0.01321751, + "balance_loss_clip": 1.15315926, + "balance_loss_mlp": 1.03412247, + "epoch": 0.32609871941321467, + "flos": 20486872438560.0, + "grad_norm": 2.1628221278129747, + "language_loss": 0.7762754, + "learning_rate": 3.1488184193049563e-06, + "loss": 0.80461776, + "num_input_tokens_seen": 58444780, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 2.88085938, + "step": 2712, + "time_per_iteration": 3.132819890975952 + }, + { + "auxiliary_loss_clip": 0.0151684, + "auxiliary_loss_mlp": 0.01309309, + "balance_loss_clip": 1.15818429, + "balance_loss_mlp": 1.02339745, + "epoch": 0.3262189623038538, + "flos": 22418876361600.0, + "grad_norm": 1.6685996096505684, + "language_loss": 0.72327864, + "learning_rate": 3.1481806906874767e-06, + "loss": 0.75154018, + "num_input_tokens_seen": 58466090, + "router_z_loss_clip": 3.59179688, + "router_z_loss_mlp": 2.86523438, + "step": 2713, + "time_per_iteration": 3.0459256172180176 + }, + { + "auxiliary_loss_clip": 0.01518138, + "auxiliary_loss_mlp": 0.01313751, + "balance_loss_clip": 1.15959847, + "balance_loss_mlp": 1.0291748, + "epoch": 0.3263392051944929, + "flos": 20925629765280.0, + "grad_norm": 1.6715694127159944, + "language_loss": 0.87653208, + "learning_rate": 3.147542787891899e-06, + "loss": 0.90485102, + "num_input_tokens_seen": 58485435, + "router_z_loss_clip": 3.58398438, + "router_z_loss_mlp": 2.8515625, + "step": 2714, + "time_per_iteration": 3.0267786979675293 + }, + { + "auxiliary_loss_clip": 0.0151793, + "auxiliary_loss_mlp": 0.01312047, + "balance_loss_clip": 1.15849888, + "balance_loss_mlp": 1.02594495, + "epoch": 0.32645944808513194, + "flos": 24027501513600.0, + "grad_norm": 1.854713557998639, + "language_loss": 0.75443482, + "learning_rate": 3.1469047110149926e-06, + "loss": 0.78273457, + "num_input_tokens_seen": 58504175, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 2.8671875, + "step": 2715, + "time_per_iteration": 3.1106722354888916 + }, + { + "auxiliary_loss_clip": 0.01518473, + "auxiliary_loss_mlp": 0.01319467, + "balance_loss_clip": 1.15889025, + "balance_loss_mlp": 1.03222013, + "epoch": 0.32657969097577105, + "flos": 21034522605600.0, + "grad_norm": 1.9882983231650213, + "language_loss": 0.85475463, + "learning_rate": 3.146266460153554e-06, + "loss": 0.88313401, + "num_input_tokens_seen": 58523885, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 2.87695312, + "step": 2716, + "time_per_iteration": 3.206723690032959 + }, + { + "auxiliary_loss_clip": 0.01523965, + "auxiliary_loss_mlp": 0.01318678, + "balance_loss_clip": 1.16503596, + "balance_loss_mlp": 1.02990532, + "epoch": 0.32669993386641016, + "flos": 22712443234560.0, + "grad_norm": 1.8236186748348362, + "language_loss": 0.80329531, + "learning_rate": 3.145628035404404e-06, + "loss": 0.83172172, + "num_input_tokens_seen": 58543085, + "router_z_loss_clip": 3.58984375, + "router_z_loss_mlp": 2.890625, + "step": 2717, + "time_per_iteration": 3.0948498249053955 + }, + { + "auxiliary_loss_clip": 0.01497196, + "auxiliary_loss_mlp": 0.01234863, + "balance_loss_clip": 1.13876343, + "balance_loss_mlp": 1.00369263, + "epoch": 0.3268201767570492, + "flos": 72112354367040.0, + "grad_norm": 0.9021679375014463, + "language_loss": 0.57390636, + "learning_rate": 3.1449894368643922e-06, + "loss": 0.60122699, + "num_input_tokens_seen": 58605400, + "router_z_loss_clip": 3.5859375, + "router_z_loss_mlp": 2.3046875, + "step": 2718, + "time_per_iteration": 3.511857032775879 + }, + { + "auxiliary_loss_clip": 0.01520587, + "auxiliary_loss_mlp": 0.01308841, + "balance_loss_clip": 1.1602726, + "balance_loss_mlp": 1.02235723, + "epoch": 0.32694041964768833, + "flos": 24537488653440.0, + "grad_norm": 1.5559316439815423, + "language_loss": 0.71209991, + "learning_rate": 3.1443506646303934e-06, + "loss": 0.74039423, + "num_input_tokens_seen": 58626700, + "router_z_loss_clip": 3.60742188, + "router_z_loss_mlp": 2.87109375, + "step": 2719, + "time_per_iteration": 3.013869524002075 + }, + { + "auxiliary_loss_clip": 0.0152016, + "auxiliary_loss_mlp": 0.01311206, + "balance_loss_clip": 1.16034794, + "balance_loss_mlp": 1.02472186, + "epoch": 0.32706066253832744, + "flos": 33185840256000.0, + "grad_norm": 2.973747282546887, + "language_loss": 0.66677064, + "learning_rate": 3.1437117187993086e-06, + "loss": 0.69508427, + "num_input_tokens_seen": 58649020, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 2.86914062, + "step": 2720, + "time_per_iteration": 3.0878961086273193 + }, + { + "auxiliary_loss_clip": 0.0152047, + "auxiliary_loss_mlp": 0.01316949, + "balance_loss_clip": 1.16267586, + "balance_loss_mlp": 1.02512479, + "epoch": 0.3271809054289665, + "flos": 24063950839680.0, + "grad_norm": 1.64442033992106, + "language_loss": 0.79834843, + "learning_rate": 3.143072599468065e-06, + "loss": 0.82672262, + "num_input_tokens_seen": 58668845, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 2.91796875, + "step": 2721, + "time_per_iteration": 3.021289825439453 + }, + { + "auxiliary_loss_clip": 0.01514894, + "auxiliary_loss_mlp": 0.01305308, + "balance_loss_clip": 1.15398502, + "balance_loss_mlp": 1.01939654, + "epoch": 0.3273011483196056, + "flos": 38256847960320.0, + "grad_norm": 2.2768616012421456, + "language_loss": 0.75636816, + "learning_rate": 3.1424333067336174e-06, + "loss": 0.78457022, + "num_input_tokens_seen": 58691610, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 2.86523438, + "step": 2722, + "time_per_iteration": 3.1900880336761475 + }, + { + "auxiliary_loss_clip": 0.01515581, + "auxiliary_loss_mlp": 0.01314948, + "balance_loss_clip": 1.15582728, + "balance_loss_mlp": 1.02598453, + "epoch": 0.3274213912102447, + "flos": 29056674093120.0, + "grad_norm": 1.7687507663202529, + "language_loss": 0.78076839, + "learning_rate": 3.141793840692945e-06, + "loss": 0.80907369, + "num_input_tokens_seen": 58712360, + "router_z_loss_clip": 3.6015625, + "router_z_loss_mlp": 2.89453125, + "step": 2723, + "time_per_iteration": 3.0486018657684326 + }, + { + "auxiliary_loss_clip": 0.01519871, + "auxiliary_loss_mlp": 0.01322112, + "balance_loss_clip": 1.15979505, + "balance_loss_mlp": 1.02914357, + "epoch": 0.32754163410088377, + "flos": 29135868819840.0, + "grad_norm": 2.201487965665558, + "language_loss": 0.61159217, + "learning_rate": 3.1411542014430553e-06, + "loss": 0.64001203, + "num_input_tokens_seen": 58733440, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 2.9296875, + "step": 2724, + "time_per_iteration": 3.0593342781066895 + }, + { + "auxiliary_loss_clip": 0.01518542, + "auxiliary_loss_mlp": 0.01327299, + "balance_loss_clip": 1.15974152, + "balance_loss_mlp": 1.0413878, + "epoch": 0.3276618769915229, + "flos": 20633162808960.0, + "grad_norm": 1.9344164581187624, + "language_loss": 0.81869256, + "learning_rate": 3.1405143890809804e-06, + "loss": 0.84715104, + "num_input_tokens_seen": 58752735, + "router_z_loss_clip": 3.58984375, + "router_z_loss_mlp": 2.86328125, + "step": 2725, + "time_per_iteration": 3.0229744911193848 + }, + { + "auxiliary_loss_clip": 0.0152429, + "auxiliary_loss_mlp": 0.01316535, + "balance_loss_clip": 1.16416836, + "balance_loss_mlp": 1.02967, + "epoch": 0.327782119882162, + "flos": 18659209976640.0, + "grad_norm": 2.012348660796834, + "language_loss": 0.69972587, + "learning_rate": 3.1398744037037796e-06, + "loss": 0.7281341, + "num_input_tokens_seen": 58772070, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 2.87304688, + "step": 2726, + "time_per_iteration": 3.0256755352020264 + }, + { + "auxiliary_loss_clip": 0.01517112, + "auxiliary_loss_mlp": 0.01319621, + "balance_loss_clip": 1.15732789, + "balance_loss_mlp": 1.03199267, + "epoch": 0.32790236277280105, + "flos": 21797796548160.0, + "grad_norm": 2.7212119979813565, + "language_loss": 0.83916545, + "learning_rate": 3.139234245408538e-06, + "loss": 0.86753285, + "num_input_tokens_seen": 58790950, + "router_z_loss_clip": 3.59570312, + "router_z_loss_mlp": 2.87890625, + "step": 2727, + "time_per_iteration": 3.065631628036499 + }, + { + "auxiliary_loss_clip": 0.01520302, + "auxiliary_loss_mlp": 0.01320755, + "balance_loss_clip": 1.16252434, + "balance_loss_mlp": 1.03541565, + "epoch": 0.32802260566344016, + "flos": 23333674760640.0, + "grad_norm": 1.4830090391762172, + "language_loss": 0.760023, + "learning_rate": 3.1385939142923666e-06, + "loss": 0.78843355, + "num_input_tokens_seen": 58813340, + "router_z_loss_clip": 3.57617188, + "router_z_loss_mlp": 2.85742188, + "step": 2728, + "time_per_iteration": 3.047889471054077 + }, + { + "auxiliary_loss_clip": 0.0151797, + "auxiliary_loss_mlp": 0.01338622, + "balance_loss_clip": 1.15792632, + "balance_loss_mlp": 1.04775107, + "epoch": 0.3281428485540792, + "flos": 24209293006080.0, + "grad_norm": 17.75969837061196, + "language_loss": 0.78130889, + "learning_rate": 3.137953410452405e-06, + "loss": 0.80987483, + "num_input_tokens_seen": 58833610, + "router_z_loss_clip": 3.60742188, + "router_z_loss_mlp": 2.91015625, + "step": 2729, + "time_per_iteration": 3.0395352840423584 + }, + { + "auxiliary_loss_clip": 0.01512891, + "auxiliary_loss_mlp": 0.01323959, + "balance_loss_clip": 1.15357685, + "balance_loss_mlp": 1.03671217, + "epoch": 0.3282630914447183, + "flos": 34131702182400.0, + "grad_norm": 2.1293687844202713, + "language_loss": 0.7393769, + "learning_rate": 3.1373127339858146e-06, + "loss": 0.76774538, + "num_input_tokens_seen": 58856210, + "router_z_loss_clip": 3.59179688, + "router_z_loss_mlp": 2.87695312, + "step": 2730, + "time_per_iteration": 3.193896770477295 + }, + { + "auxiliary_loss_clip": 0.01515783, + "auxiliary_loss_mlp": 0.01318895, + "balance_loss_clip": 1.15705681, + "balance_loss_mlp": 1.03489113, + "epoch": 0.32838333433535744, + "flos": 27603252429120.0, + "grad_norm": 2.5000451785027713, + "language_loss": 0.75054914, + "learning_rate": 3.136671884989787e-06, + "loss": 0.77889591, + "num_input_tokens_seen": 58876120, + "router_z_loss_clip": 3.58984375, + "router_z_loss_mlp": 2.84375, + "step": 2731, + "time_per_iteration": 3.082862615585327 + }, + { + "auxiliary_loss_clip": 0.01519608, + "auxiliary_loss_mlp": 0.01323073, + "balance_loss_clip": 1.16111481, + "balance_loss_mlp": 1.03334713, + "epoch": 0.3285035772259965, + "flos": 12351162947040.0, + "grad_norm": 2.556535426480239, + "language_loss": 0.88122654, + "learning_rate": 3.1360308635615383e-06, + "loss": 0.90965337, + "num_input_tokens_seen": 58894660, + "router_z_loss_clip": 3.58398438, + "router_z_loss_mlp": 2.90039062, + "step": 2732, + "time_per_iteration": 3.1086971759796143 + }, + { + "auxiliary_loss_clip": 0.01516712, + "auxiliary_loss_mlp": 0.0132493, + "balance_loss_clip": 1.15881896, + "balance_loss_mlp": 1.03177071, + "epoch": 0.3286238201166356, + "flos": 24318375487200.0, + "grad_norm": 2.3978988739959606, + "language_loss": 0.78484535, + "learning_rate": 3.135389669798311e-06, + "loss": 0.81326181, + "num_input_tokens_seen": 58912720, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 2.93164062, + "step": 2733, + "time_per_iteration": 4.083120822906494 + }, + { + "auxiliary_loss_clip": 0.01519839, + "auxiliary_loss_mlp": 0.01327673, + "balance_loss_clip": 1.16191244, + "balance_loss_mlp": 1.04080725, + "epoch": 0.3287440630072747, + "flos": 21394692056160.0, + "grad_norm": 2.0193335768825316, + "language_loss": 0.80244231, + "learning_rate": 3.134748303797373e-06, + "loss": 0.83091736, + "num_input_tokens_seen": 58930090, + "router_z_loss_clip": 3.58203125, + "router_z_loss_mlp": 2.87304688, + "step": 2734, + "time_per_iteration": 4.036103010177612 + }, + { + "auxiliary_loss_clip": 0.01519067, + "auxiliary_loss_mlp": 0.01344721, + "balance_loss_clip": 1.16025066, + "balance_loss_mlp": 1.05098915, + "epoch": 0.32886430589791377, + "flos": 23734806988320.0, + "grad_norm": 2.3544375477537263, + "language_loss": 0.81256688, + "learning_rate": 3.1341067656560203e-06, + "loss": 0.84120476, + "num_input_tokens_seen": 58947935, + "router_z_loss_clip": 3.58984375, + "router_z_loss_mlp": 2.93554688, + "step": 2735, + "time_per_iteration": 3.1233718395233154 + }, + { + "auxiliary_loss_clip": 0.01514318, + "auxiliary_loss_mlp": 0.01333643, + "balance_loss_clip": 1.1560663, + "balance_loss_mlp": 1.04468012, + "epoch": 0.3289845487885529, + "flos": 22420886554080.0, + "grad_norm": 2.0145854052519048, + "language_loss": 0.86556494, + "learning_rate": 3.133465055471572e-06, + "loss": 0.89404458, + "num_input_tokens_seen": 58967720, + "router_z_loss_clip": 3.58203125, + "router_z_loss_mlp": 2.89453125, + "step": 2736, + "time_per_iteration": 3.0859036445617676 + }, + { + "auxiliary_loss_clip": 0.01520576, + "auxiliary_loss_mlp": 0.01309195, + "balance_loss_clip": 1.16139185, + "balance_loss_mlp": 1.02385569, + "epoch": 0.329104791679192, + "flos": 19684645911360.0, + "grad_norm": 6.780691347978835, + "language_loss": 0.66207182, + "learning_rate": 3.1328231733413767e-06, + "loss": 0.69036955, + "num_input_tokens_seen": 58984360, + "router_z_loss_clip": 3.58789062, + "router_z_loss_mlp": 2.85742188, + "step": 2737, + "time_per_iteration": 3.9883077144622803 + }, + { + "auxiliary_loss_clip": 0.01523654, + "auxiliary_loss_mlp": 0.01312952, + "balance_loss_clip": 1.16472936, + "balance_loss_mlp": 1.0236069, + "epoch": 0.32922503456983104, + "flos": 15999053951520.0, + "grad_norm": 2.3100895280610487, + "language_loss": 0.90485907, + "learning_rate": 3.1321811193628067e-06, + "loss": 0.93322515, + "num_input_tokens_seen": 59002505, + "router_z_loss_clip": 3.58984375, + "router_z_loss_mlp": 2.89648438, + "step": 2738, + "time_per_iteration": 3.8252952098846436 + }, + { + "auxiliary_loss_clip": 0.01519768, + "auxiliary_loss_mlp": 0.01319649, + "balance_loss_clip": 1.16113758, + "balance_loss_mlp": 1.0301137, + "epoch": 0.32934527746047015, + "flos": 26836527024000.0, + "grad_norm": 2.6041498457442476, + "language_loss": 0.70225912, + "learning_rate": 3.131538893633261e-06, + "loss": 0.73065329, + "num_input_tokens_seen": 59022065, + "router_z_loss_clip": 3.5859375, + "router_z_loss_mlp": 2.8984375, + "step": 2739, + "time_per_iteration": 3.0117080211639404 + }, + { + "auxiliary_loss_clip": 0.01516828, + "auxiliary_loss_mlp": 0.01331129, + "balance_loss_clip": 1.15707016, + "balance_loss_mlp": 1.04369128, + "epoch": 0.32946552035110926, + "flos": 23406004490400.0, + "grad_norm": 2.212248630054083, + "language_loss": 0.77934039, + "learning_rate": 3.130896496250165e-06, + "loss": 0.8078199, + "num_input_tokens_seen": 59041890, + "router_z_loss_clip": 3.6015625, + "router_z_loss_mlp": 2.88085938, + "step": 2740, + "time_per_iteration": 2.96621036529541 + }, + { + "auxiliary_loss_clip": 0.01514391, + "auxiliary_loss_mlp": 0.01320184, + "balance_loss_clip": 1.1556294, + "balance_loss_mlp": 1.03389132, + "epoch": 0.3295857632417483, + "flos": 14174160245280.0, + "grad_norm": 2.3217863774002803, + "language_loss": 0.87225592, + "learning_rate": 3.1302539273109693e-06, + "loss": 0.90060174, + "num_input_tokens_seen": 59058715, + "router_z_loss_clip": 3.5859375, + "router_z_loss_mlp": 2.86523438, + "step": 2741, + "time_per_iteration": 2.906886577606201 + }, + { + "auxiliary_loss_clip": 0.01521911, + "auxiliary_loss_mlp": 0.01320688, + "balance_loss_clip": 1.163481, + "balance_loss_mlp": 1.03248799, + "epoch": 0.32970600613238743, + "flos": 22198701206880.0, + "grad_norm": 1.828013127038507, + "language_loss": 0.80500662, + "learning_rate": 3.1296111869131513e-06, + "loss": 0.83343261, + "num_input_tokens_seen": 59076140, + "router_z_loss_clip": 3.5859375, + "router_z_loss_mlp": 2.88671875, + "step": 2742, + "time_per_iteration": 3.0096771717071533 + }, + { + "auxiliary_loss_clip": 0.0151436, + "auxiliary_loss_mlp": 0.01311419, + "balance_loss_clip": 1.15618968, + "balance_loss_mlp": 1.02531707, + "epoch": 0.32982624902302654, + "flos": 22055938155360.0, + "grad_norm": 4.910243903158858, + "language_loss": 0.85835809, + "learning_rate": 3.1289682751542153e-06, + "loss": 0.88661587, + "num_input_tokens_seen": 59095700, + "router_z_loss_clip": 3.58007812, + "router_z_loss_mlp": 2.86523438, + "step": 2743, + "time_per_iteration": 2.9280519485473633 + }, + { + "auxiliary_loss_clip": 0.01516968, + "auxiliary_loss_mlp": 0.01308815, + "balance_loss_clip": 1.15856981, + "balance_loss_mlp": 1.02214098, + "epoch": 0.3299464919136656, + "flos": 18663647571360.0, + "grad_norm": 7.240780660026294, + "language_loss": 0.71323562, + "learning_rate": 3.1283251921316883e-06, + "loss": 0.74149346, + "num_input_tokens_seen": 59113445, + "router_z_loss_clip": 3.58203125, + "router_z_loss_mlp": 2.87304688, + "step": 2744, + "time_per_iteration": 3.0692365169525146 + }, + { + "auxiliary_loss_clip": 0.01511913, + "auxiliary_loss_mlp": 0.01316936, + "balance_loss_clip": 1.15317583, + "balance_loss_mlp": 1.03293192, + "epoch": 0.3300667348043047, + "flos": 13409065751040.0, + "grad_norm": 3.0686183354144, + "language_loss": 0.80734545, + "learning_rate": 3.1276819379431277e-06, + "loss": 0.83563399, + "num_input_tokens_seen": 59131535, + "router_z_loss_clip": 3.58789062, + "router_z_loss_mlp": 2.84375, + "step": 2745, + "time_per_iteration": 3.0548675060272217 + }, + { + "auxiliary_loss_clip": 0.01516899, + "auxiliary_loss_mlp": 0.0132352, + "balance_loss_clip": 1.15785432, + "balance_loss_mlp": 1.03436625, + "epoch": 0.33018697769494376, + "flos": 15744401735040.0, + "grad_norm": 2.349478126576921, + "language_loss": 0.75498503, + "learning_rate": 3.1270385126861134e-06, + "loss": 0.78338921, + "num_input_tokens_seen": 59149520, + "router_z_loss_clip": 3.59179688, + "router_z_loss_mlp": 2.89453125, + "step": 2746, + "time_per_iteration": 3.13747239112854 + }, + { + "auxiliary_loss_clip": 0.01520458, + "auxiliary_loss_mlp": 0.0131768, + "balance_loss_clip": 1.1617651, + "balance_loss_mlp": 1.03043342, + "epoch": 0.3303072205855829, + "flos": 18260505151200.0, + "grad_norm": 2.6808806606800792, + "language_loss": 0.81997848, + "learning_rate": 3.1263949164582533e-06, + "loss": 0.84835982, + "num_input_tokens_seen": 59169170, + "router_z_loss_clip": 3.58984375, + "router_z_loss_mlp": 2.87695312, + "step": 2747, + "time_per_iteration": 3.062366485595703 + }, + { + "auxiliary_loss_clip": 0.01508577, + "auxiliary_loss_mlp": 0.01314928, + "balance_loss_clip": 1.15005028, + "balance_loss_mlp": 1.03016055, + "epoch": 0.330427463476222, + "flos": 17751390359040.0, + "grad_norm": 4.302041871119572, + "language_loss": 0.7826823, + "learning_rate": 3.1257511493571797e-06, + "loss": 0.81091726, + "num_input_tokens_seen": 59187675, + "router_z_loss_clip": 3.58789062, + "router_z_loss_mlp": 2.8515625, + "step": 2748, + "time_per_iteration": 2.9799070358276367 + }, + { + "auxiliary_loss_clip": 0.01518439, + "auxiliary_loss_mlp": 0.01320759, + "balance_loss_clip": 1.16132379, + "balance_loss_mlp": 1.03503799, + "epoch": 0.33054770636686104, + "flos": 27165064024800.0, + "grad_norm": 4.317187980396827, + "language_loss": 0.79008389, + "learning_rate": 3.125107211480552e-06, + "loss": 0.81847584, + "num_input_tokens_seen": 59207610, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.86132812, + "step": 2749, + "time_per_iteration": 3.0960235595703125 + }, + { + "auxiliary_loss_clip": 0.01514358, + "auxiliary_loss_mlp": 0.01297065, + "balance_loss_clip": 1.15557313, + "balance_loss_mlp": 1.01287031, + "epoch": 0.33066794925750015, + "flos": 20119079427840.0, + "grad_norm": 1.8943745361454607, + "language_loss": 0.79902333, + "learning_rate": 3.124463102926054e-06, + "loss": 0.82713753, + "num_input_tokens_seen": 59226945, + "router_z_loss_clip": 3.59179688, + "router_z_loss_mlp": 2.84765625, + "step": 2750, + "time_per_iteration": 3.044715642929077 + }, + { + "auxiliary_loss_clip": 0.01508942, + "auxiliary_loss_mlp": 0.01252472, + "balance_loss_clip": 1.15236664, + "balance_loss_mlp": 1.01824951, + "epoch": 0.33078819214813926, + "flos": 70648957596960.0, + "grad_norm": 0.7697128404336129, + "language_loss": 0.61515427, + "learning_rate": 3.1238188237913984e-06, + "loss": 0.64276838, + "num_input_tokens_seen": 59291485, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.3359375, + "step": 2751, + "time_per_iteration": 3.578416585922241 + }, + { + "auxiliary_loss_clip": 0.01516088, + "auxiliary_loss_mlp": 0.01318273, + "balance_loss_clip": 1.1586194, + "balance_loss_mlp": 1.02969134, + "epoch": 0.3309084350387783, + "flos": 21144060224640.0, + "grad_norm": 2.584112258966417, + "language_loss": 0.76090854, + "learning_rate": 3.1231743741743202e-06, + "loss": 0.78925216, + "num_input_tokens_seen": 59310990, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 2.88867188, + "step": 2752, + "time_per_iteration": 3.151264190673828 + }, + { + "auxiliary_loss_clip": 0.01521182, + "auxiliary_loss_mlp": 0.01300914, + "balance_loss_clip": 1.16198456, + "balance_loss_mlp": 1.01786387, + "epoch": 0.3310286779294174, + "flos": 14211064709280.0, + "grad_norm": 2.493120847022727, + "language_loss": 0.8330425, + "learning_rate": 3.122529754172582e-06, + "loss": 0.86126351, + "num_input_tokens_seen": 59327875, + "router_z_loss_clip": 3.59570312, + "router_z_loss_mlp": 2.8359375, + "step": 2753, + "time_per_iteration": 3.0739006996154785 + }, + { + "auxiliary_loss_clip": 0.01518845, + "auxiliary_loss_mlp": 0.01307638, + "balance_loss_clip": 1.15979719, + "balance_loss_mlp": 1.02363372, + "epoch": 0.33114892082005654, + "flos": 20780287598880.0, + "grad_norm": 2.1416949417648152, + "language_loss": 0.7280488, + "learning_rate": 3.1218849638839736e-06, + "loss": 0.75631368, + "num_input_tokens_seen": 59347135, + "router_z_loss_clip": 3.58789062, + "router_z_loss_mlp": 2.84570312, + "step": 2754, + "time_per_iteration": 3.1018989086151123 + }, + { + "auxiliary_loss_clip": 0.01520415, + "auxiliary_loss_mlp": 0.01323794, + "balance_loss_clip": 1.16110063, + "balance_loss_mlp": 1.03597486, + "epoch": 0.3312691637106956, + "flos": 17092571662080.0, + "grad_norm": 2.6066853745400063, + "language_loss": 0.78443986, + "learning_rate": 3.121240003406307e-06, + "loss": 0.81288195, + "num_input_tokens_seen": 59365985, + "router_z_loss_clip": 3.59570312, + "router_z_loss_mlp": 2.8828125, + "step": 2755, + "time_per_iteration": 3.2221834659576416 + }, + { + "auxiliary_loss_clip": 0.01523458, + "auxiliary_loss_mlp": 0.01310515, + "balance_loss_clip": 1.16393542, + "balance_loss_mlp": 1.02345884, + "epoch": 0.3313894066013347, + "flos": 29458564884000.0, + "grad_norm": 2.766738256486938, + "language_loss": 0.7298671, + "learning_rate": 3.120594872837425e-06, + "loss": 0.75820684, + "num_input_tokens_seen": 59384655, + "router_z_loss_clip": 3.59375, + "router_z_loss_mlp": 2.875, + "step": 2756, + "time_per_iteration": 3.0295281410217285 + }, + { + "auxiliary_loss_clip": 0.0150822, + "auxiliary_loss_mlp": 0.01234993, + "balance_loss_clip": 1.15134799, + "balance_loss_mlp": 1.00305939, + "epoch": 0.3315096494919738, + "flos": 61425495475200.0, + "grad_norm": 0.9592685963205106, + "language_loss": 0.62326384, + "learning_rate": 3.1199495722751906e-06, + "loss": 0.65069592, + "num_input_tokens_seen": 59444185, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.3125, + "step": 2757, + "time_per_iteration": 3.525005340576172 + }, + { + "auxiliary_loss_clip": 0.0151879, + "auxiliary_loss_mlp": 0.01309586, + "balance_loss_clip": 1.1594317, + "balance_loss_mlp": 1.02424693, + "epoch": 0.33162989238261287, + "flos": 21655223137440.0, + "grad_norm": 2.071275228942356, + "language_loss": 0.83465278, + "learning_rate": 3.1193041018174972e-06, + "loss": 0.8629365, + "num_input_tokens_seen": 59464900, + "router_z_loss_clip": 3.59179688, + "router_z_loss_mlp": 2.859375, + "step": 2758, + "time_per_iteration": 2.9549918174743652 + }, + { + "auxiliary_loss_clip": 0.01525504, + "auxiliary_loss_mlp": 0.01316469, + "balance_loss_clip": 1.16595423, + "balance_loss_mlp": 1.0265522, + "epoch": 0.331750135273252, + "flos": 22677017968800.0, + "grad_norm": 4.517458427427514, + "language_loss": 0.94843149, + "learning_rate": 3.118658461562261e-06, + "loss": 0.97685117, + "num_input_tokens_seen": 59481000, + "router_z_loss_clip": 3.59570312, + "router_z_loss_mlp": 2.90234375, + "step": 2759, + "time_per_iteration": 2.9511396884918213 + }, + { + "auxiliary_loss_clip": 0.01528088, + "auxiliary_loss_mlp": 0.01328904, + "balance_loss_clip": 1.16939008, + "balance_loss_mlp": 1.04242051, + "epoch": 0.33187037816389103, + "flos": 22749082201440.0, + "grad_norm": 1.610709113859594, + "language_loss": 0.84820306, + "learning_rate": 3.118012651607426e-06, + "loss": 0.876773, + "num_input_tokens_seen": 59502605, + "router_z_loss_clip": 3.58398438, + "router_z_loss_mlp": 2.8671875, + "step": 2760, + "time_per_iteration": 2.970125198364258 + }, + { + "auxiliary_loss_clip": 0.01535589, + "auxiliary_loss_mlp": 0.01317118, + "balance_loss_clip": 1.17712796, + "balance_loss_mlp": 1.03101575, + "epoch": 0.33199062105453014, + "flos": 19205532658080.0, + "grad_norm": 5.128386032574765, + "language_loss": 0.83583164, + "learning_rate": 3.1173666720509603e-06, + "loss": 0.86435878, + "num_input_tokens_seen": 59519540, + "router_z_loss_clip": 3.58398438, + "router_z_loss_mlp": 2.8671875, + "step": 2761, + "time_per_iteration": 4.630098104476929 + }, + { + "auxiliary_loss_clip": 0.01525278, + "auxiliary_loss_mlp": 0.01313206, + "balance_loss_clip": 1.16623044, + "balance_loss_mlp": 1.02672243, + "epoch": 0.33211086394516925, + "flos": 31579718362560.0, + "grad_norm": 2.085594379195805, + "language_loss": 0.68140477, + "learning_rate": 3.116720522990859e-06, + "loss": 0.70978963, + "num_input_tokens_seen": 59540415, + "router_z_loss_clip": 3.58789062, + "router_z_loss_mlp": 2.8671875, + "step": 2762, + "time_per_iteration": 2.9827938079833984 + }, + { + "auxiliary_loss_clip": 0.01518457, + "auxiliary_loss_mlp": 0.01321966, + "balance_loss_clip": 1.1604315, + "balance_loss_mlp": 1.03491032, + "epoch": 0.3322311068358083, + "flos": 17934509337120.0, + "grad_norm": 2.6402319738544375, + "language_loss": 0.61873013, + "learning_rate": 3.116074204525142e-06, + "loss": 0.64713436, + "num_input_tokens_seen": 59558590, + "router_z_loss_clip": 3.58007812, + "router_z_loss_mlp": 2.87304688, + "step": 2763, + "time_per_iteration": 2.9121570587158203 + }, + { + "auxiliary_loss_clip": 0.01526779, + "auxiliary_loss_mlp": 0.01314083, + "balance_loss_clip": 1.16779113, + "balance_loss_mlp": 1.02836275, + "epoch": 0.3323513497264474, + "flos": 32272558983360.0, + "grad_norm": 1.9689371363570918, + "language_loss": 0.83791482, + "learning_rate": 3.1154277167518553e-06, + "loss": 0.86632341, + "num_input_tokens_seen": 59580205, + "router_z_loss_clip": 3.59179688, + "router_z_loss_mlp": 2.86132812, + "step": 2764, + "time_per_iteration": 4.015355825424194 + }, + { + "auxiliary_loss_clip": 0.01541707, + "auxiliary_loss_mlp": 0.01231888, + "balance_loss_clip": 1.18648851, + "balance_loss_mlp": 0.99766541, + "epoch": 0.33247159261708653, + "flos": 52674774675840.0, + "grad_norm": 0.816319097796636, + "language_loss": 0.59521222, + "learning_rate": 3.114781059769072e-06, + "loss": 0.62294817, + "num_input_tokens_seen": 59631530, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.3359375, + "step": 2765, + "time_per_iteration": 3.3946263790130615 + }, + { + "auxiliary_loss_clip": 0.01532055, + "auxiliary_loss_mlp": 0.01315855, + "balance_loss_clip": 1.17508388, + "balance_loss_mlp": 1.02746367, + "epoch": 0.3325918355077256, + "flos": 27127666494720.0, + "grad_norm": 15.18699112995987, + "language_loss": 0.67637432, + "learning_rate": 3.1141342336748874e-06, + "loss": 0.70485342, + "num_input_tokens_seen": 59651090, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.88671875, + "step": 2766, + "time_per_iteration": 3.9181735515594482 + }, + { + "auxiliary_loss_clip": 0.01530882, + "auxiliary_loss_mlp": 0.01320268, + "balance_loss_clip": 1.17411208, + "balance_loss_mlp": 1.03702664, + "epoch": 0.3327120783983647, + "flos": 23666952781440.0, + "grad_norm": 2.409337458314923, + "language_loss": 0.82479453, + "learning_rate": 3.1134872385674253e-06, + "loss": 0.85330606, + "num_input_tokens_seen": 59675245, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.83789062, + "step": 2767, + "time_per_iteration": 3.1324422359466553 + }, + { + "auxiliary_loss_clip": 0.01526755, + "auxiliary_loss_mlp": 0.01319101, + "balance_loss_clip": 1.16961503, + "balance_loss_mlp": 1.03128219, + "epoch": 0.3328323212890038, + "flos": 19173369214080.0, + "grad_norm": 6.195611882166057, + "language_loss": 0.85592949, + "learning_rate": 3.1128400745448353e-06, + "loss": 0.88438797, + "num_input_tokens_seen": 59694625, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 2.8828125, + "step": 2768, + "time_per_iteration": 3.0304620265960693 + }, + { + "auxiliary_loss_clip": 0.01529993, + "auxiliary_loss_mlp": 0.01332016, + "balance_loss_clip": 1.17371786, + "balance_loss_mlp": 1.0420996, + "epoch": 0.33295256417964286, + "flos": 37709387434080.0, + "grad_norm": 3.6633389809676014, + "language_loss": 0.6279012, + "learning_rate": 3.11219274170529e-06, + "loss": 0.65652132, + "num_input_tokens_seen": 59716435, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.90039062, + "step": 2769, + "time_per_iteration": 3.068646192550659 + }, + { + "auxiliary_loss_clip": 0.01522956, + "auxiliary_loss_mlp": 0.01308873, + "balance_loss_clip": 1.16618681, + "balance_loss_mlp": 1.02601361, + "epoch": 0.333072807070282, + "flos": 26508445161120.0, + "grad_norm": 2.4290458315150376, + "language_loss": 0.81409842, + "learning_rate": 3.1115452401469903e-06, + "loss": 0.84241676, + "num_input_tokens_seen": 59736835, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.83398438, + "step": 2770, + "time_per_iteration": 3.0313241481781006 + }, + { + "auxiliary_loss_clip": 0.01532026, + "auxiliary_loss_mlp": 0.01308408, + "balance_loss_clip": 1.17530584, + "balance_loss_mlp": 1.02573919, + "epoch": 0.3331930499609211, + "flos": 21432886077600.0, + "grad_norm": 2.05945874659623, + "language_loss": 0.86924577, + "learning_rate": 3.1108975699681613e-06, + "loss": 0.89765006, + "num_input_tokens_seen": 59754230, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.83203125, + "step": 2771, + "time_per_iteration": 2.933326244354248 + }, + { + "auxiliary_loss_clip": 0.0152786, + "auxiliary_loss_mlp": 0.01300283, + "balance_loss_clip": 1.17104495, + "balance_loss_mlp": 1.01952171, + "epoch": 0.33331329285156014, + "flos": 20661609293280.0, + "grad_norm": 3.255787048532327, + "language_loss": 0.71685016, + "learning_rate": 3.1102497312670542e-06, + "loss": 0.74513161, + "num_input_tokens_seen": 59772235, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 2.8125, + "step": 2772, + "time_per_iteration": 3.0177788734436035 + }, + { + "auxiliary_loss_clip": 0.01526108, + "auxiliary_loss_mlp": 0.01300697, + "balance_loss_clip": 1.16976404, + "balance_loss_mlp": 1.01860046, + "epoch": 0.33343353574219925, + "flos": 28004119159680.0, + "grad_norm": 10.472358833590198, + "language_loss": 0.81283057, + "learning_rate": 3.109601724141946e-06, + "loss": 0.84109855, + "num_input_tokens_seen": 59791230, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.82617188, + "step": 2773, + "time_per_iteration": 3.0514872074127197 + }, + { + "auxiliary_loss_clip": 0.01535755, + "auxiliary_loss_mlp": 0.0131894, + "balance_loss_clip": 1.17924023, + "balance_loss_mlp": 1.03379107, + "epoch": 0.33355377863283836, + "flos": 23767008360480.0, + "grad_norm": 2.3265788613383767, + "language_loss": 0.6832726, + "learning_rate": 3.108953548691138e-06, + "loss": 0.71181953, + "num_input_tokens_seen": 59811315, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.85351562, + "step": 2774, + "time_per_iteration": 3.0245609283447266 + }, + { + "auxiliary_loss_clip": 0.01537332, + "auxiliary_loss_mlp": 0.01312167, + "balance_loss_clip": 1.17998147, + "balance_loss_mlp": 1.02530169, + "epoch": 0.3336740215234774, + "flos": 37782741224160.0, + "grad_norm": 3.3251461623333127, + "language_loss": 0.72365135, + "learning_rate": 3.108305205012959e-06, + "loss": 0.75214636, + "num_input_tokens_seen": 59832010, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.875, + "step": 2775, + "time_per_iteration": 3.116288661956787 + }, + { + "auxiliary_loss_clip": 0.01529093, + "auxiliary_loss_mlp": 0.01309411, + "balance_loss_clip": 1.1729753, + "balance_loss_mlp": 1.02655149, + "epoch": 0.3337942644141165, + "flos": 25521354960480.0, + "grad_norm": 2.5463254258945307, + "language_loss": 0.87686211, + "learning_rate": 3.107656693205761e-06, + "loss": 0.90524715, + "num_input_tokens_seen": 59851450, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.83398438, + "step": 2776, + "time_per_iteration": 2.937220573425293 + }, + { + "auxiliary_loss_clip": 0.01538395, + "auxiliary_loss_mlp": 0.01322297, + "balance_loss_clip": 1.1810323, + "balance_loss_mlp": 1.03180778, + "epoch": 0.3339145073047556, + "flos": 25991972305920.0, + "grad_norm": 3.138597411178409, + "language_loss": 0.70462, + "learning_rate": 3.107008013367924e-06, + "loss": 0.7332269, + "num_input_tokens_seen": 59870245, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.90820312, + "step": 2777, + "time_per_iteration": 3.0603115558624268 + }, + { + "auxiliary_loss_clip": 0.01537791, + "auxiliary_loss_mlp": 0.01321898, + "balance_loss_clip": 1.1803683, + "balance_loss_mlp": 1.03465128, + "epoch": 0.3340347501953947, + "flos": 19064590158240.0, + "grad_norm": 2.7819210536384884, + "language_loss": 0.86789995, + "learning_rate": 3.1063591655978507e-06, + "loss": 0.89649683, + "num_input_tokens_seen": 59886195, + "router_z_loss_clip": 3.57617188, + "router_z_loss_mlp": 2.87695312, + "step": 2778, + "time_per_iteration": 2.985037326812744 + }, + { + "auxiliary_loss_clip": 0.01536837, + "auxiliary_loss_mlp": 0.01320908, + "balance_loss_clip": 1.17971218, + "balance_loss_mlp": 1.03347087, + "epoch": 0.3341549930860338, + "flos": 18111446025120.0, + "grad_norm": 2.7342318943817343, + "language_loss": 0.80304265, + "learning_rate": 3.105710149993972e-06, + "loss": 0.8316201, + "num_input_tokens_seen": 59905525, + "router_z_loss_clip": 3.57617188, + "router_z_loss_mlp": 2.87890625, + "step": 2779, + "time_per_iteration": 2.942000150680542 + }, + { + "auxiliary_loss_clip": 0.01544189, + "auxiliary_loss_mlp": 0.01323783, + "balance_loss_clip": 1.18762279, + "balance_loss_mlp": 1.03615522, + "epoch": 0.33427523597667286, + "flos": 22677207609600.0, + "grad_norm": 2.315234980415644, + "language_loss": 0.85477161, + "learning_rate": 3.1050609666547427e-06, + "loss": 0.88345134, + "num_input_tokens_seen": 59925085, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.87890625, + "step": 2780, + "time_per_iteration": 2.960592269897461 + }, + { + "auxiliary_loss_clip": 0.01545913, + "auxiliary_loss_mlp": 0.01323124, + "balance_loss_clip": 1.18956506, + "balance_loss_mlp": 1.03530502, + "epoch": 0.33439547886731197, + "flos": 22640568642720.0, + "grad_norm": 2.1156342276647826, + "language_loss": 0.77238691, + "learning_rate": 3.104411615678644e-06, + "loss": 0.80107725, + "num_input_tokens_seen": 59943935, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.88085938, + "step": 2781, + "time_per_iteration": 2.8808987140655518 + }, + { + "auxiliary_loss_clip": 0.01549155, + "auxiliary_loss_mlp": 0.01319498, + "balance_loss_clip": 1.19374704, + "balance_loss_mlp": 1.03320467, + "epoch": 0.3345157217579511, + "flos": 24098617542240.0, + "grad_norm": 3.5911173393661118, + "language_loss": 0.74051929, + "learning_rate": 3.1037620971641803e-06, + "loss": 0.76920581, + "num_input_tokens_seen": 59963725, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.86523438, + "step": 2782, + "time_per_iteration": 2.9614486694335938 + }, + { + "auxiliary_loss_clip": 0.01551999, + "auxiliary_loss_mlp": 0.01318688, + "balance_loss_clip": 1.19643521, + "balance_loss_mlp": 1.03277659, + "epoch": 0.33463596464859013, + "flos": 18991501865280.0, + "grad_norm": 2.824050548140747, + "language_loss": 0.64850551, + "learning_rate": 3.1031124112098844e-06, + "loss": 0.67721236, + "num_input_tokens_seen": 59981935, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.86328125, + "step": 2783, + "time_per_iteration": 3.063671350479126 + }, + { + "auxiliary_loss_clip": 0.01547477, + "auxiliary_loss_mlp": 0.01321237, + "balance_loss_clip": 1.19142795, + "balance_loss_mlp": 1.03418088, + "epoch": 0.33475620753922924, + "flos": 20377676172960.0, + "grad_norm": 1.851763422151178, + "language_loss": 0.72521341, + "learning_rate": 3.1024625579143127e-06, + "loss": 0.75390053, + "num_input_tokens_seen": 59999455, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.875, + "step": 2784, + "time_per_iteration": 3.009166955947876 + }, + { + "auxiliary_loss_clip": 0.01550629, + "auxiliary_loss_mlp": 0.01312745, + "balance_loss_clip": 1.19484687, + "balance_loss_mlp": 1.02664256, + "epoch": 0.33487645042986836, + "flos": 18184306749120.0, + "grad_norm": 4.402206285311121, + "language_loss": 0.72859323, + "learning_rate": 3.101812537376048e-06, + "loss": 0.75722694, + "num_input_tokens_seen": 60018475, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.86523438, + "step": 2785, + "time_per_iteration": 3.037323474884033 + }, + { + "auxiliary_loss_clip": 0.01547485, + "auxiliary_loss_mlp": 0.01316119, + "balance_loss_clip": 1.19082808, + "balance_loss_mlp": 1.02887237, + "epoch": 0.3349966933205074, + "flos": 25851219446880.0, + "grad_norm": 2.28442175333192, + "language_loss": 0.84234464, + "learning_rate": 3.1011623496936973e-06, + "loss": 0.87098062, + "num_input_tokens_seen": 60036770, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.87695312, + "step": 2786, + "time_per_iteration": 2.9878406524658203 + }, + { + "auxiliary_loss_clip": 0.01541005, + "auxiliary_loss_mlp": 0.01309898, + "balance_loss_clip": 1.18400598, + "balance_loss_mlp": 1.02493978, + "epoch": 0.3351169362111465, + "flos": 28113998132160.0, + "grad_norm": 2.099824939385194, + "language_loss": 0.70167005, + "learning_rate": 3.100511994965893e-06, + "loss": 0.73017913, + "num_input_tokens_seen": 60056725, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.85351562, + "step": 2787, + "time_per_iteration": 3.0218324661254883 + }, + { + "auxiliary_loss_clip": 0.01539155, + "auxiliary_loss_mlp": 0.01317638, + "balance_loss_clip": 1.18314636, + "balance_loss_mlp": 1.0309639, + "epoch": 0.33523717910178563, + "flos": 22675462914240.0, + "grad_norm": 1.9276095702148757, + "language_loss": 0.84520614, + "learning_rate": 3.0998614732912947e-06, + "loss": 0.87377411, + "num_input_tokens_seen": 60076100, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.87109375, + "step": 2788, + "time_per_iteration": 3.9054996967315674 + }, + { + "auxiliary_loss_clip": 0.01542894, + "auxiliary_loss_mlp": 0.01312998, + "balance_loss_clip": 1.18715024, + "balance_loss_mlp": 1.02689552, + "epoch": 0.3353574219924247, + "flos": 15671654795520.0, + "grad_norm": 3.942611794997263, + "language_loss": 0.68179524, + "learning_rate": 3.0992107847685855e-06, + "loss": 0.71035421, + "num_input_tokens_seen": 60093815, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.86523438, + "step": 2789, + "time_per_iteration": 3.099769115447998 + }, + { + "auxiliary_loss_clip": 0.01541566, + "auxiliary_loss_mlp": 0.01321713, + "balance_loss_clip": 1.18491876, + "balance_loss_mlp": 1.03351235, + "epoch": 0.3354776648830638, + "flos": 24792975289440.0, + "grad_norm": 2.764948656565006, + "language_loss": 0.79292434, + "learning_rate": 3.0985599294964736e-06, + "loss": 0.8215571, + "num_input_tokens_seen": 60113370, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.88671875, + "step": 2790, + "time_per_iteration": 3.065685272216797 + }, + { + "auxiliary_loss_clip": 0.01532538, + "auxiliary_loss_mlp": 0.01314406, + "balance_loss_clip": 1.17602062, + "balance_loss_mlp": 1.02811289, + "epoch": 0.33559790777370285, + "flos": 28697414918400.0, + "grad_norm": 2.231198069920012, + "language_loss": 0.70253277, + "learning_rate": 3.097908907573695e-06, + "loss": 0.73100221, + "num_input_tokens_seen": 60131350, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.86914062, + "step": 2791, + "time_per_iteration": 3.03195858001709 + }, + { + "auxiliary_loss_clip": 0.01535155, + "auxiliary_loss_mlp": 0.01314786, + "balance_loss_clip": 1.17809749, + "balance_loss_mlp": 1.02925611, + "epoch": 0.33571815066434196, + "flos": 22237919288640.0, + "grad_norm": 5.249712811191815, + "language_loss": 0.89523578, + "learning_rate": 3.0972577190990067e-06, + "loss": 0.92373526, + "num_input_tokens_seen": 60149830, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.859375, + "step": 2792, + "time_per_iteration": 3.801525354385376 + }, + { + "auxiliary_loss_clip": 0.01529193, + "auxiliary_loss_mlp": 0.01323802, + "balance_loss_clip": 1.17247248, + "balance_loss_mlp": 1.03636432, + "epoch": 0.3358383935549811, + "flos": 23844572176320.0, + "grad_norm": 1.838433376900735, + "language_loss": 0.79830939, + "learning_rate": 3.096606364171196e-06, + "loss": 0.82683933, + "num_input_tokens_seen": 60169620, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.87695312, + "step": 2793, + "time_per_iteration": 3.9241135120391846 + }, + { + "auxiliary_loss_clip": 0.01538585, + "auxiliary_loss_mlp": 0.01320913, + "balance_loss_clip": 1.18237329, + "balance_loss_mlp": 1.03690839, + "epoch": 0.33595863644562013, + "flos": 22269172456800.0, + "grad_norm": 2.2542818983069286, + "language_loss": 0.85009044, + "learning_rate": 3.0959548428890703e-06, + "loss": 0.87868541, + "num_input_tokens_seen": 60188490, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.84570312, + "step": 2794, + "time_per_iteration": 2.9551849365234375 + }, + { + "auxiliary_loss_clip": 0.01538495, + "auxiliary_loss_mlp": 0.013198, + "balance_loss_clip": 1.18183899, + "balance_loss_mlp": 1.03217173, + "epoch": 0.33607887933625924, + "flos": 20121810255360.0, + "grad_norm": 2.4135723820906043, + "language_loss": 0.84347117, + "learning_rate": 3.095303155351468e-06, + "loss": 0.8720541, + "num_input_tokens_seen": 60208695, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.87890625, + "step": 2795, + "time_per_iteration": 2.906466484069824 + }, + { + "auxiliary_loss_clip": 0.0153235, + "auxiliary_loss_mlp": 0.01308598, + "balance_loss_clip": 1.17470133, + "balance_loss_mlp": 1.02554762, + "epoch": 0.33619912222689835, + "flos": 19320152650560.0, + "grad_norm": 2.5311197419215796, + "language_loss": 0.79038584, + "learning_rate": 3.0946513016572464e-06, + "loss": 0.81879538, + "num_input_tokens_seen": 60227600, + "router_z_loss_clip": 3.57617188, + "router_z_loss_mlp": 2.8359375, + "step": 2796, + "time_per_iteration": 2.9970312118530273 + }, + { + "auxiliary_loss_clip": 0.01530459, + "auxiliary_loss_mlp": 0.0132554, + "balance_loss_clip": 1.17265844, + "balance_loss_mlp": 1.03562319, + "epoch": 0.3363193651175374, + "flos": 16802570036160.0, + "grad_norm": 2.251451805901939, + "language_loss": 0.76796877, + "learning_rate": 3.0939992819052938e-06, + "loss": 0.79652876, + "num_input_tokens_seen": 60245110, + "router_z_loss_clip": 3.57617188, + "router_z_loss_mlp": 2.90039062, + "step": 2797, + "time_per_iteration": 2.9025137424468994 + }, + { + "auxiliary_loss_clip": 0.0153134, + "auxiliary_loss_mlp": 0.01317445, + "balance_loss_clip": 1.17412376, + "balance_loss_mlp": 1.03038943, + "epoch": 0.3364396080081765, + "flos": 23552560357920.0, + "grad_norm": 2.5620332762151494, + "language_loss": 0.81531799, + "learning_rate": 3.0933470961945193e-06, + "loss": 0.84380579, + "num_input_tokens_seen": 60263405, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.875, + "step": 2798, + "time_per_iteration": 3.0245325565338135 + }, + { + "auxiliary_loss_clip": 0.01527361, + "auxiliary_loss_mlp": 0.01313543, + "balance_loss_clip": 1.16998374, + "balance_loss_mlp": 1.02820396, + "epoch": 0.3365598508988156, + "flos": 28040416773120.0, + "grad_norm": 2.488945998899906, + "language_loss": 0.68286806, + "learning_rate": 3.0926947446238597e-06, + "loss": 0.71127707, + "num_input_tokens_seen": 60282975, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.85742188, + "step": 2799, + "time_per_iteration": 3.070378065109253 + }, + { + "auxiliary_loss_clip": 0.01524631, + "auxiliary_loss_mlp": 0.01310617, + "balance_loss_clip": 1.1670053, + "balance_loss_mlp": 1.02298856, + "epoch": 0.3366800937894547, + "flos": 16984664953920.0, + "grad_norm": 3.182572937092157, + "language_loss": 0.81934893, + "learning_rate": 3.092042227292276e-06, + "loss": 0.84770143, + "num_input_tokens_seen": 60299810, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 2.88085938, + "step": 2800, + "time_per_iteration": 2.9543747901916504 + }, + { + "auxiliary_loss_clip": 0.01523973, + "auxiliary_loss_mlp": 0.01311588, + "balance_loss_clip": 1.16671252, + "balance_loss_mlp": 1.02910995, + "epoch": 0.3368003366800938, + "flos": 23917622541120.0, + "grad_norm": 1.6395024211089118, + "language_loss": 0.88393909, + "learning_rate": 3.0913895442987557e-06, + "loss": 0.91229469, + "num_input_tokens_seen": 60320775, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.83007812, + "step": 2801, + "time_per_iteration": 3.0234436988830566 + }, + { + "auxiliary_loss_clip": 0.01526581, + "auxiliary_loss_mlp": 0.01314368, + "balance_loss_clip": 1.1690836, + "balance_loss_mlp": 1.030936, + "epoch": 0.3369205795707329, + "flos": 24793582140000.0, + "grad_norm": 1.7489782119874935, + "language_loss": 0.85743153, + "learning_rate": 3.090736695742308e-06, + "loss": 0.88584101, + "num_input_tokens_seen": 60341905, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.83984375, + "step": 2802, + "time_per_iteration": 3.052455186843872 + }, + { + "auxiliary_loss_clip": 0.01526146, + "auxiliary_loss_mlp": 0.01316226, + "balance_loss_clip": 1.16869128, + "balance_loss_mlp": 1.03508258, + "epoch": 0.33704082246137196, + "flos": 17933030138880.0, + "grad_norm": 2.752913005163107, + "language_loss": 0.52419436, + "learning_rate": 3.0900836817219713e-06, + "loss": 0.55261815, + "num_input_tokens_seen": 60358335, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.81445312, + "step": 2803, + "time_per_iteration": 2.94852352142334 + }, + { + "auxiliary_loss_clip": 0.0152453, + "auxiliary_loss_mlp": 0.01308157, + "balance_loss_clip": 1.16645551, + "balance_loss_mlp": 1.02377093, + "epoch": 0.33716106535201107, + "flos": 21288567971520.0, + "grad_norm": 2.1969458480086774, + "language_loss": 0.83504605, + "learning_rate": 3.089430502336807e-06, + "loss": 0.86337292, + "num_input_tokens_seen": 60378305, + "router_z_loss_clip": 3.58203125, + "router_z_loss_mlp": 2.84765625, + "step": 2804, + "time_per_iteration": 3.1165173053741455 + }, + { + "auxiliary_loss_clip": 0.01525965, + "auxiliary_loss_mlp": 0.01314939, + "balance_loss_clip": 1.16927695, + "balance_loss_mlp": 1.0318886, + "epoch": 0.3372813082426502, + "flos": 18404785329120.0, + "grad_norm": 2.6282183013020735, + "language_loss": 0.90634489, + "learning_rate": 3.088777157685902e-06, + "loss": 0.93475389, + "num_input_tokens_seen": 60393895, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.8359375, + "step": 2805, + "time_per_iteration": 2.9483025074005127 + }, + { + "auxiliary_loss_clip": 0.01521782, + "auxiliary_loss_mlp": 0.01317214, + "balance_loss_clip": 1.16470027, + "balance_loss_mlp": 1.03225625, + "epoch": 0.33740155113328923, + "flos": 17203285054080.0, + "grad_norm": 2.2959708478487357, + "language_loss": 0.85267818, + "learning_rate": 3.088123647868367e-06, + "loss": 0.88106811, + "num_input_tokens_seen": 60410445, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.85546875, + "step": 2806, + "time_per_iteration": 3.0863211154937744 + }, + { + "auxiliary_loss_clip": 0.01522104, + "auxiliary_loss_mlp": 0.01303735, + "balance_loss_clip": 1.16674352, + "balance_loss_mlp": 1.02182913, + "epoch": 0.33752179402392835, + "flos": 29061490969440.0, + "grad_norm": 2.5809773087177437, + "language_loss": 0.81010258, + "learning_rate": 3.0874699729833405e-06, + "loss": 0.83836102, + "num_input_tokens_seen": 60431815, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.82421875, + "step": 2807, + "time_per_iteration": 3.0456621646881104 + }, + { + "auxiliary_loss_clip": 0.01518932, + "auxiliary_loss_mlp": 0.01312598, + "balance_loss_clip": 1.16128647, + "balance_loss_mlp": 1.0263046, + "epoch": 0.3376420369145674, + "flos": 25082976915360.0, + "grad_norm": 1.93192045509405, + "language_loss": 0.79750395, + "learning_rate": 3.086816133129983e-06, + "loss": 0.82581925, + "num_input_tokens_seen": 60452075, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 2.8671875, + "step": 2808, + "time_per_iteration": 2.9852774143218994 + }, + { + "auxiliary_loss_clip": 0.01523631, + "auxiliary_loss_mlp": 0.01317312, + "balance_loss_clip": 1.16704166, + "balance_loss_mlp": 1.03063738, + "epoch": 0.3377622798052065, + "flos": 27493107959520.0, + "grad_norm": 2.29754972591537, + "language_loss": 0.76186925, + "learning_rate": 3.0861621284074826e-06, + "loss": 0.79027867, + "num_input_tokens_seen": 60472600, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.87109375, + "step": 2809, + "time_per_iteration": 3.038054943084717 + }, + { + "auxiliary_loss_clip": 0.01526226, + "auxiliary_loss_mlp": 0.01320495, + "balance_loss_clip": 1.16973042, + "balance_loss_mlp": 1.03763461, + "epoch": 0.3378825226958456, + "flos": 21977046853920.0, + "grad_norm": 2.839839490183443, + "language_loss": 0.72576439, + "learning_rate": 3.085507958915051e-06, + "loss": 0.75423151, + "num_input_tokens_seen": 60491030, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.83398438, + "step": 2810, + "time_per_iteration": 2.9894943237304688 + }, + { + "auxiliary_loss_clip": 0.01516652, + "auxiliary_loss_mlp": 0.01321041, + "balance_loss_clip": 1.16045797, + "balance_loss_mlp": 1.03360403, + "epoch": 0.3380027655864847, + "flos": 42526539413280.0, + "grad_norm": 2.4175074492071342, + "language_loss": 0.71497822, + "learning_rate": 3.084853624751925e-06, + "loss": 0.74335515, + "num_input_tokens_seen": 60512615, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.87695312, + "step": 2811, + "time_per_iteration": 3.2649528980255127 + }, + { + "auxiliary_loss_clip": 0.01525696, + "auxiliary_loss_mlp": 0.0132399, + "balance_loss_clip": 1.17031503, + "balance_loss_mlp": 1.03807831, + "epoch": 0.3381230084771238, + "flos": 26727558327360.0, + "grad_norm": 3.2756522639061108, + "language_loss": 0.85614789, + "learning_rate": 3.0841991260173668e-06, + "loss": 0.88464475, + "num_input_tokens_seen": 60532520, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 2.86328125, + "step": 2812, + "time_per_iteration": 3.065505266189575 + }, + { + "auxiliary_loss_clip": 0.01522225, + "auxiliary_loss_mlp": 0.01312058, + "balance_loss_clip": 1.16591978, + "balance_loss_mlp": 1.02767241, + "epoch": 0.3382432513677629, + "flos": 22712557019040.0, + "grad_norm": 9.30734671146375, + "language_loss": 0.80560029, + "learning_rate": 3.0835444628106634e-06, + "loss": 0.83394313, + "num_input_tokens_seen": 60551500, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.84960938, + "step": 2813, + "time_per_iteration": 3.0139548778533936 + }, + { + "auxiliary_loss_clip": 0.01516888, + "auxiliary_loss_mlp": 0.01328185, + "balance_loss_clip": 1.16079831, + "balance_loss_mlp": 1.03941226, + "epoch": 0.33836349425840195, + "flos": 22124323356480.0, + "grad_norm": 2.066645462969518, + "language_loss": 0.8301779, + "learning_rate": 3.082889635231126e-06, + "loss": 0.85862863, + "num_input_tokens_seen": 60570160, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.890625, + "step": 2814, + "time_per_iteration": 3.0584206581115723 + }, + { + "auxiliary_loss_clip": 0.01516158, + "auxiliary_loss_mlp": 0.01318055, + "balance_loss_clip": 1.16046906, + "balance_loss_mlp": 1.02909207, + "epoch": 0.33848373714904106, + "flos": 27310557903840.0, + "grad_norm": 3.4425592147822233, + "language_loss": 0.76701176, + "learning_rate": 3.0822346433780925e-06, + "loss": 0.79535383, + "num_input_tokens_seen": 60590885, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.89257812, + "step": 2815, + "time_per_iteration": 4.847989797592163 + }, + { + "auxiliary_loss_clip": 0.01517391, + "auxiliary_loss_mlp": 0.0130773, + "balance_loss_clip": 1.16148138, + "balance_loss_mlp": 1.0229634, + "epoch": 0.3386039800396802, + "flos": 25851105662400.0, + "grad_norm": 2.452319814446308, + "language_loss": 0.86972356, + "learning_rate": 3.0815794873509237e-06, + "loss": 0.89797473, + "num_input_tokens_seen": 60609170, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.85351562, + "step": 2816, + "time_per_iteration": 2.9813270568847656 + }, + { + "auxiliary_loss_clip": 0.01517854, + "auxiliary_loss_mlp": 0.01322055, + "balance_loss_clip": 1.16157746, + "balance_loss_mlp": 1.03805065, + "epoch": 0.33872422293031923, + "flos": 18882684881280.0, + "grad_norm": 2.3551479118878955, + "language_loss": 0.72707671, + "learning_rate": 3.0809241672490066e-06, + "loss": 0.75547576, + "num_input_tokens_seen": 60627340, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.84570312, + "step": 2817, + "time_per_iteration": 2.9445369243621826 + }, + { + "auxiliary_loss_clip": 0.01517496, + "auxiliary_loss_mlp": 0.01325999, + "balance_loss_clip": 1.1623919, + "balance_loss_mlp": 1.0429486, + "epoch": 0.33884446582095834, + "flos": 23149114512480.0, + "grad_norm": 1.700032099081218, + "language_loss": 0.85503817, + "learning_rate": 3.080268683171753e-06, + "loss": 0.88347304, + "num_input_tokens_seen": 60647630, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.83398438, + "step": 2818, + "time_per_iteration": 2.9908037185668945 + }, + { + "auxiliary_loss_clip": 0.01516085, + "auxiliary_loss_mlp": 0.01327161, + "balance_loss_clip": 1.15997195, + "balance_loss_mlp": 1.04163098, + "epoch": 0.33896470871159745, + "flos": 15999167736000.0, + "grad_norm": 2.746364461065849, + "language_loss": 0.89783722, + "learning_rate": 3.0796130352185985e-06, + "loss": 0.92626965, + "num_input_tokens_seen": 60664485, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.86132812, + "step": 2819, + "time_per_iteration": 3.7859280109405518 + }, + { + "auxiliary_loss_clip": 0.01513505, + "auxiliary_loss_mlp": 0.01306714, + "balance_loss_clip": 1.15825212, + "balance_loss_mlp": 1.02271008, + "epoch": 0.3390849516022365, + "flos": 34498622845440.0, + "grad_norm": 4.573483321795399, + "language_loss": 0.66469675, + "learning_rate": 3.0789572234890057e-06, + "loss": 0.69289899, + "num_input_tokens_seen": 60686125, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 2.84570312, + "step": 2820, + "time_per_iteration": 3.1110730171203613 + }, + { + "auxiliary_loss_clip": 0.01519683, + "auxiliary_loss_mlp": 0.01319718, + "balance_loss_clip": 1.16531384, + "balance_loss_mlp": 1.03285265, + "epoch": 0.3392051944928756, + "flos": 16182172929600.0, + "grad_norm": 2.946869447054871, + "language_loss": 0.7720865, + "learning_rate": 3.0783012480824596e-06, + "loss": 0.80048048, + "num_input_tokens_seen": 60705270, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 2.87304688, + "step": 2821, + "time_per_iteration": 3.88889217376709 + }, + { + "auxiliary_loss_clip": 0.01511862, + "auxiliary_loss_mlp": 0.01313128, + "balance_loss_clip": 1.15576744, + "balance_loss_mlp": 1.03160286, + "epoch": 0.33932543738351467, + "flos": 17088247851840.0, + "grad_norm": 3.790904581061963, + "language_loss": 0.74168384, + "learning_rate": 3.077645109098471e-06, + "loss": 0.7699337, + "num_input_tokens_seen": 60721540, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.8203125, + "step": 2822, + "time_per_iteration": 2.9789505004882812 + }, + { + "auxiliary_loss_clip": 0.0152078, + "auxiliary_loss_mlp": 0.01310485, + "balance_loss_clip": 1.16605926, + "balance_loss_mlp": 1.02552724, + "epoch": 0.3394456802741538, + "flos": 22129140232800.0, + "grad_norm": 1.9750571296812598, + "language_loss": 0.72047061, + "learning_rate": 3.076988806636577e-06, + "loss": 0.74878335, + "num_input_tokens_seen": 60739300, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.85351562, + "step": 2823, + "time_per_iteration": 3.1069436073303223 + }, + { + "auxiliary_loss_clip": 0.01521196, + "auxiliary_loss_mlp": 0.0131945, + "balance_loss_clip": 1.16516459, + "balance_loss_mlp": 1.03411067, + "epoch": 0.3395659231647929, + "flos": 25229153501280.0, + "grad_norm": 3.497826589579651, + "language_loss": 0.88634717, + "learning_rate": 3.0763323407963377e-06, + "loss": 0.91475368, + "num_input_tokens_seen": 60758910, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.85742188, + "step": 2824, + "time_per_iteration": 3.0865283012390137 + }, + { + "auxiliary_loss_clip": 0.015209, + "auxiliary_loss_mlp": 0.01317031, + "balance_loss_clip": 1.16515803, + "balance_loss_mlp": 1.0341711, + "epoch": 0.33968616605543195, + "flos": 29099078140320.0, + "grad_norm": 1.848115457666363, + "language_loss": 0.80279565, + "learning_rate": 3.075675711677337e-06, + "loss": 0.83117491, + "num_input_tokens_seen": 60779005, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.83398438, + "step": 2825, + "time_per_iteration": 3.056614398956299 + }, + { + "auxiliary_loss_clip": 0.0151734, + "auxiliary_loss_mlp": 0.01310099, + "balance_loss_clip": 1.16200984, + "balance_loss_mlp": 1.02838349, + "epoch": 0.33980640894607106, + "flos": 21436109971200.0, + "grad_norm": 2.02729133663857, + "language_loss": 0.78087848, + "learning_rate": 3.0750189193791865e-06, + "loss": 0.8091529, + "num_input_tokens_seen": 60798590, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.82226562, + "step": 2826, + "time_per_iteration": 2.9934263229370117 + }, + { + "auxiliary_loss_clip": 0.01515751, + "auxiliary_loss_mlp": 0.01315895, + "balance_loss_clip": 1.16144478, + "balance_loss_mlp": 1.02941072, + "epoch": 0.33992665183671017, + "flos": 32492847922560.0, + "grad_norm": 1.940837678610494, + "language_loss": 0.70118624, + "learning_rate": 3.0743619640015203e-06, + "loss": 0.72950268, + "num_input_tokens_seen": 60818840, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.8671875, + "step": 2827, + "time_per_iteration": 3.049180030822754 + }, + { + "auxiliary_loss_clip": 0.01519471, + "auxiliary_loss_mlp": 0.01311546, + "balance_loss_clip": 1.16392004, + "balance_loss_mlp": 1.02868617, + "epoch": 0.3400468947273492, + "flos": 17057298108960.0, + "grad_norm": 2.682377838289792, + "language_loss": 0.92723858, + "learning_rate": 3.073704845643999e-06, + "loss": 0.95554882, + "num_input_tokens_seen": 60835965, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.83398438, + "step": 2828, + "time_per_iteration": 2.9898855686187744 + }, + { + "auxiliary_loss_clip": 0.01516434, + "auxiliary_loss_mlp": 0.0131046, + "balance_loss_clip": 1.16168976, + "balance_loss_mlp": 1.02626455, + "epoch": 0.34016713761798834, + "flos": 16874861837760.0, + "grad_norm": 3.1787579557445165, + "language_loss": 0.78008759, + "learning_rate": 3.0730475644063063e-06, + "loss": 0.80835652, + "num_input_tokens_seen": 60851065, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.84765625, + "step": 2829, + "time_per_iteration": 3.0272693634033203 + }, + { + "auxiliary_loss_clip": 0.01514007, + "auxiliary_loss_mlp": 0.01303858, + "balance_loss_clip": 1.16000676, + "balance_loss_mlp": 1.02118921, + "epoch": 0.34028738050862745, + "flos": 21909268503360.0, + "grad_norm": 1.935041372809926, + "language_loss": 0.64720815, + "learning_rate": 3.072390120388151e-06, + "loss": 0.67538679, + "num_input_tokens_seen": 60869390, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 2.83203125, + "step": 2830, + "time_per_iteration": 2.989027738571167 + }, + { + "auxiliary_loss_clip": 0.01526447, + "auxiliary_loss_mlp": 0.01319293, + "balance_loss_clip": 1.17112756, + "balance_loss_mlp": 1.03376269, + "epoch": 0.3404076233992665, + "flos": 22748171925600.0, + "grad_norm": 2.6036135848278987, + "language_loss": 0.71368945, + "learning_rate": 3.071732513689267e-06, + "loss": 0.74214685, + "num_input_tokens_seen": 60887925, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.859375, + "step": 2831, + "time_per_iteration": 2.9864370822906494 + }, + { + "auxiliary_loss_clip": 0.01521533, + "auxiliary_loss_mlp": 0.01311858, + "balance_loss_clip": 1.16706681, + "balance_loss_mlp": 1.02556491, + "epoch": 0.3405278662899056, + "flos": 17054074215360.0, + "grad_norm": 3.2504870963207972, + "language_loss": 0.67364413, + "learning_rate": 3.0710747444094134e-06, + "loss": 0.70197803, + "num_input_tokens_seen": 60905955, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 2.8671875, + "step": 2832, + "time_per_iteration": 2.9567506313323975 + }, + { + "auxiliary_loss_clip": 0.01520525, + "auxiliary_loss_mlp": 0.01309717, + "balance_loss_clip": 1.16584456, + "balance_loss_mlp": 1.02609444, + "epoch": 0.3406481091805447, + "flos": 42816389326560.0, + "grad_norm": 1.90325383398262, + "language_loss": 0.64820278, + "learning_rate": 3.070416812648372e-06, + "loss": 0.67650521, + "num_input_tokens_seen": 60929405, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 2.84179688, + "step": 2833, + "time_per_iteration": 3.1420226097106934 + }, + { + "auxiliary_loss_clip": 0.01522105, + "auxiliary_loss_mlp": 0.01310985, + "balance_loss_clip": 1.16770911, + "balance_loss_mlp": 1.03060496, + "epoch": 0.3407683520711838, + "flos": 26763931797120.0, + "grad_norm": 2.664958831600771, + "language_loss": 0.65383363, + "learning_rate": 3.069758718505951e-06, + "loss": 0.68216455, + "num_input_tokens_seen": 60951145, + "router_z_loss_clip": 3.54101562, + "router_z_loss_mlp": 2.80859375, + "step": 2834, + "time_per_iteration": 3.172724723815918 + }, + { + "auxiliary_loss_clip": 0.01524667, + "auxiliary_loss_mlp": 0.01311558, + "balance_loss_clip": 1.17033339, + "balance_loss_mlp": 1.02869833, + "epoch": 0.3408885949618229, + "flos": 28770199786080.0, + "grad_norm": 1.8386415035163823, + "language_loss": 0.80254889, + "learning_rate": 3.0691004620819836e-06, + "loss": 0.83091116, + "num_input_tokens_seen": 60971275, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.83398438, + "step": 2835, + "time_per_iteration": 3.0467844009399414 + }, + { + "auxiliary_loss_clip": 0.01692626, + "auxiliary_loss_mlp": 0.01252647, + "balance_loss_clip": 1.34907746, + "balance_loss_mlp": 1.0245285, + "epoch": 0.341008837852462, + "flos": 63582870710880.0, + "grad_norm": 0.8072576473272837, + "language_loss": 0.60133922, + "learning_rate": 3.0684420434763254e-06, + "loss": 0.63079202, + "num_input_tokens_seen": 61037460, + "router_z_loss_clip": 3.4375, + "router_z_loss_mlp": 2.28125, + "step": 2836, + "time_per_iteration": 3.48396635055542 + }, + { + "auxiliary_loss_clip": 0.0152838, + "auxiliary_loss_mlp": 0.01319287, + "balance_loss_clip": 1.17389894, + "balance_loss_mlp": 1.03509235, + "epoch": 0.34112908074310105, + "flos": 20814195738240.0, + "grad_norm": 2.5513189079857193, + "language_loss": 0.76778245, + "learning_rate": 3.06778346278886e-06, + "loss": 0.79625916, + "num_input_tokens_seen": 61056295, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 2.84570312, + "step": 2837, + "time_per_iteration": 2.9617788791656494 + }, + { + "auxiliary_loss_clip": 0.01526779, + "auxiliary_loss_mlp": 0.0131047, + "balance_loss_clip": 1.17136788, + "balance_loss_mlp": 1.02818227, + "epoch": 0.34124932363374016, + "flos": 24978976807680.0, + "grad_norm": 1.860687285116553, + "language_loss": 0.78861701, + "learning_rate": 3.0671247201194906e-06, + "loss": 0.81698954, + "num_input_tokens_seen": 61078430, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.82617188, + "step": 2838, + "time_per_iteration": 2.948167085647583 + }, + { + "auxiliary_loss_clip": 0.01518616, + "auxiliary_loss_mlp": 0.0132152, + "balance_loss_clip": 1.16377854, + "balance_loss_mlp": 1.03942347, + "epoch": 0.3413695665243792, + "flos": 28405516884480.0, + "grad_norm": 2.8137502517590995, + "language_loss": 0.75465071, + "learning_rate": 3.066465815568151e-06, + "loss": 0.78305209, + "num_input_tokens_seen": 61099260, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.82617188, + "step": 2839, + "time_per_iteration": 3.06130051612854 + }, + { + "auxiliary_loss_clip": 0.01518666, + "auxiliary_loss_mlp": 0.01311431, + "balance_loss_clip": 1.16422987, + "balance_loss_mlp": 1.03124166, + "epoch": 0.34148980941501833, + "flos": 25304403699360.0, + "grad_norm": 1.8889223630414527, + "language_loss": 0.68823457, + "learning_rate": 3.0658067492347947e-06, + "loss": 0.71653551, + "num_input_tokens_seen": 61121900, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 2.80664062, + "step": 2840, + "time_per_iteration": 2.9836745262145996 + }, + { + "auxiliary_loss_clip": 0.01524603, + "auxiliary_loss_mlp": 0.01299997, + "balance_loss_clip": 1.16996026, + "balance_loss_mlp": 1.01885378, + "epoch": 0.34161005230565744, + "flos": 17532011695680.0, + "grad_norm": 2.392837947448758, + "language_loss": 0.66825092, + "learning_rate": 3.065147521219402e-06, + "loss": 0.69649696, + "num_input_tokens_seen": 61141155, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.81640625, + "step": 2841, + "time_per_iteration": 3.207336664199829 + }, + { + "auxiliary_loss_clip": 0.01522909, + "auxiliary_loss_mlp": 0.01312862, + "balance_loss_clip": 1.17088985, + "balance_loss_mlp": 1.03381705, + "epoch": 0.3417302951962965, + "flos": 43653282556320.0, + "grad_norm": 1.6011083326467717, + "language_loss": 0.74466431, + "learning_rate": 3.064488131621977e-06, + "loss": 0.77302206, + "num_input_tokens_seen": 61164480, + "router_z_loss_clip": 3.52148438, + "router_z_loss_mlp": 2.79492188, + "step": 2842, + "time_per_iteration": 4.090879678726196 + }, + { + "auxiliary_loss_clip": 0.01520606, + "auxiliary_loss_mlp": 0.01317686, + "balance_loss_clip": 1.16768432, + "balance_loss_mlp": 1.03558958, + "epoch": 0.3418505380869356, + "flos": 30884791692960.0, + "grad_norm": 2.065301685571865, + "language_loss": 0.73746991, + "learning_rate": 3.063828580542549e-06, + "loss": 0.76585281, + "num_input_tokens_seen": 61185675, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 2.82617188, + "step": 2843, + "time_per_iteration": 3.9729323387145996 + }, + { + "auxiliary_loss_clip": 0.01517875, + "auxiliary_loss_mlp": 0.0131375, + "balance_loss_clip": 1.16468573, + "balance_loss_mlp": 1.03298879, + "epoch": 0.3419707809775747, + "flos": 19465912026720.0, + "grad_norm": 1.975222805199739, + "language_loss": 0.7320708, + "learning_rate": 3.0631688680811706e-06, + "loss": 0.76038706, + "num_input_tokens_seen": 61205300, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 2.8125, + "step": 2844, + "time_per_iteration": 2.950723648071289 + }, + { + "auxiliary_loss_clip": 0.01520924, + "auxiliary_loss_mlp": 0.01306362, + "balance_loss_clip": 1.16758943, + "balance_loss_mlp": 1.02598131, + "epoch": 0.3420910238682138, + "flos": 28730185212960.0, + "grad_norm": 2.2539919781923046, + "language_loss": 0.75749075, + "learning_rate": 3.062508994337921e-06, + "loss": 0.78576362, + "num_input_tokens_seen": 61224905, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 2.80859375, + "step": 2845, + "time_per_iteration": 3.076080560684204 + }, + { + "auxiliary_loss_clip": 0.01516708, + "auxiliary_loss_mlp": 0.01310112, + "balance_loss_clip": 1.16310167, + "balance_loss_mlp": 1.02935028, + "epoch": 0.3422112667588529, + "flos": 21399395148000.0, + "grad_norm": 2.394205851560134, + "language_loss": 0.79256457, + "learning_rate": 3.0618489594129013e-06, + "loss": 0.82083279, + "num_input_tokens_seen": 61243045, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 2.8125, + "step": 2846, + "time_per_iteration": 3.849597215652466 + }, + { + "auxiliary_loss_clip": 0.01524106, + "auxiliary_loss_mlp": 0.01311401, + "balance_loss_clip": 1.17032671, + "balance_loss_mlp": 1.0281601, + "epoch": 0.342331509649492, + "flos": 13883172487200.0, + "grad_norm": 3.054675226586253, + "language_loss": 0.71474171, + "learning_rate": 3.061188763406239e-06, + "loss": 0.74309677, + "num_input_tokens_seen": 61259190, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 2.8359375, + "step": 2847, + "time_per_iteration": 3.01090669631958 + }, + { + "auxiliary_loss_clip": 0.01524006, + "auxiliary_loss_mlp": 0.01308377, + "balance_loss_clip": 1.17003369, + "balance_loss_mlp": 1.02704358, + "epoch": 0.34245175254013105, + "flos": 28623985272000.0, + "grad_norm": 3.4288340837498827, + "language_loss": 0.82454246, + "learning_rate": 3.060528406418085e-06, + "loss": 0.85286629, + "num_input_tokens_seen": 61279040, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 2.81835938, + "step": 2848, + "time_per_iteration": 3.928980827331543 + }, + { + "auxiliary_loss_clip": 0.01529737, + "auxiliary_loss_mlp": 0.0130517, + "balance_loss_clip": 1.17684364, + "balance_loss_mlp": 1.02536201, + "epoch": 0.34257199543077016, + "flos": 34129691989920.0, + "grad_norm": 2.6236629651858947, + "language_loss": 0.61827797, + "learning_rate": 3.0598678885486145e-06, + "loss": 0.64662701, + "num_input_tokens_seen": 61301580, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 2.80273438, + "step": 2849, + "time_per_iteration": 3.0560662746429443 + }, + { + "auxiliary_loss_clip": 0.01518865, + "auxiliary_loss_mlp": 0.01308405, + "balance_loss_clip": 1.16513371, + "balance_loss_mlp": 1.02592623, + "epoch": 0.34269223832140927, + "flos": 19976050879200.0, + "grad_norm": 3.3190620620794222, + "language_loss": 0.74515069, + "learning_rate": 3.0592072098980282e-06, + "loss": 0.77342331, + "num_input_tokens_seen": 61321240, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 2.83007812, + "step": 2850, + "time_per_iteration": 3.0987513065338135 + }, + { + "auxiliary_loss_clip": 0.01516569, + "auxiliary_loss_mlp": 0.01314067, + "balance_loss_clip": 1.16304505, + "balance_loss_mlp": 1.03235137, + "epoch": 0.3428124812120483, + "flos": 27238152317760.0, + "grad_norm": 2.0650242399052727, + "language_loss": 0.73005033, + "learning_rate": 3.0585463705665514e-06, + "loss": 0.75835669, + "num_input_tokens_seen": 61341615, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 2.82226562, + "step": 2851, + "time_per_iteration": 3.126871347427368 + }, + { + "auxiliary_loss_clip": 0.01518371, + "auxiliary_loss_mlp": 0.01315363, + "balance_loss_clip": 1.16331661, + "balance_loss_mlp": 1.03231204, + "epoch": 0.34293272410268744, + "flos": 24573141488160.0, + "grad_norm": 2.505138234217284, + "language_loss": 0.70264721, + "learning_rate": 3.0578853706544304e-06, + "loss": 0.73098457, + "num_input_tokens_seen": 61359005, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.8359375, + "step": 2852, + "time_per_iteration": 3.0607800483703613 + }, + { + "auxiliary_loss_clip": 0.01521462, + "auxiliary_loss_mlp": 0.01303583, + "balance_loss_clip": 1.16792095, + "balance_loss_mlp": 1.02110517, + "epoch": 0.34305296699332655, + "flos": 21509008623360.0, + "grad_norm": 2.427244635406559, + "language_loss": 0.65817338, + "learning_rate": 3.0572242102619404e-06, + "loss": 0.68642384, + "num_input_tokens_seen": 61376160, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 2.83007812, + "step": 2853, + "time_per_iteration": 2.9524729251861572 + }, + { + "auxiliary_loss_clip": 0.01518346, + "auxiliary_loss_mlp": 0.01306831, + "balance_loss_clip": 1.16417837, + "balance_loss_mlp": 1.02778625, + "epoch": 0.3431732098839656, + "flos": 24058716753600.0, + "grad_norm": 2.5083296645757933, + "language_loss": 0.81052887, + "learning_rate": 3.0565628894893784e-06, + "loss": 0.83878064, + "num_input_tokens_seen": 61396795, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.79492188, + "step": 2854, + "time_per_iteration": 3.0743494033813477 + }, + { + "auxiliary_loss_clip": 0.01520124, + "auxiliary_loss_mlp": 0.01318653, + "balance_loss_clip": 1.16652095, + "balance_loss_mlp": 1.03350496, + "epoch": 0.3432934527746047, + "flos": 16802797605120.0, + "grad_norm": 1.9125098804423655, + "language_loss": 0.74518037, + "learning_rate": 3.0559014084370655e-06, + "loss": 0.77356809, + "num_input_tokens_seen": 61415320, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 2.85742188, + "step": 2855, + "time_per_iteration": 2.9248883724212646 + }, + { + "auxiliary_loss_clip": 0.01517089, + "auxiliary_loss_mlp": 0.01311908, + "balance_loss_clip": 1.1637733, + "balance_loss_mlp": 1.02656901, + "epoch": 0.34341369566524377, + "flos": 23441922822240.0, + "grad_norm": 1.8592668099873642, + "language_loss": 0.78596538, + "learning_rate": 3.055239767205349e-06, + "loss": 0.8142553, + "num_input_tokens_seen": 61437070, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 2.85742188, + "step": 2856, + "time_per_iteration": 3.106980323791504 + }, + { + "auxiliary_loss_clip": 0.01524219, + "auxiliary_loss_mlp": 0.01314087, + "balance_loss_clip": 1.17118073, + "balance_loss_mlp": 1.02989233, + "epoch": 0.3435339385558829, + "flos": 17268977355840.0, + "grad_norm": 2.2103384425249346, + "language_loss": 0.78408861, + "learning_rate": 3.054577965894599e-06, + "loss": 0.81247163, + "num_input_tokens_seen": 61453215, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 2.84570312, + "step": 2857, + "time_per_iteration": 3.1035497188568115 + }, + { + "auxiliary_loss_clip": 0.01520261, + "auxiliary_loss_mlp": 0.01321599, + "balance_loss_clip": 1.16699696, + "balance_loss_mlp": 1.03587818, + "epoch": 0.343654181446522, + "flos": 22201507890720.0, + "grad_norm": 1.6846297196685507, + "language_loss": 0.70247006, + "learning_rate": 3.0539160046052094e-06, + "loss": 0.73088861, + "num_input_tokens_seen": 61472915, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 2.86328125, + "step": 2858, + "time_per_iteration": 3.0260841846466064 + }, + { + "auxiliary_loss_clip": 0.01516373, + "auxiliary_loss_mlp": 0.01313726, + "balance_loss_clip": 1.1638087, + "balance_loss_mlp": 1.02647901, + "epoch": 0.34377442433716104, + "flos": 19903872862080.0, + "grad_norm": 9.828535558520022, + "language_loss": 0.70381045, + "learning_rate": 3.0532538834376003e-06, + "loss": 0.73211139, + "num_input_tokens_seen": 61492475, + "router_z_loss_clip": 3.52734375, + "router_z_loss_mlp": 2.875, + "step": 2859, + "time_per_iteration": 3.0552611351013184 + }, + { + "auxiliary_loss_clip": 0.015213, + "auxiliary_loss_mlp": 0.01318681, + "balance_loss_clip": 1.16865444, + "balance_loss_mlp": 1.03315043, + "epoch": 0.34389466722780015, + "flos": 22199725267200.0, + "grad_norm": 2.131279894670838, + "language_loss": 0.78238362, + "learning_rate": 3.0525916024922143e-06, + "loss": 0.81078339, + "num_input_tokens_seen": 61511660, + "router_z_loss_clip": 3.52539062, + "router_z_loss_mlp": 2.859375, + "step": 2860, + "time_per_iteration": 2.9751434326171875 + }, + { + "auxiliary_loss_clip": 0.0152967, + "auxiliary_loss_mlp": 0.01314919, + "balance_loss_clip": 1.17640233, + "balance_loss_mlp": 1.03205895, + "epoch": 0.34401491011843927, + "flos": 18626553466560.0, + "grad_norm": 4.132131099721576, + "language_loss": 0.84451604, + "learning_rate": 3.0519291618695193e-06, + "loss": 0.87296188, + "num_input_tokens_seen": 61529060, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 2.83398438, + "step": 2861, + "time_per_iteration": 2.981651544570923 + }, + { + "auxiliary_loss_clip": 0.01520649, + "auxiliary_loss_mlp": 0.01315576, + "balance_loss_clip": 1.16896355, + "balance_loss_mlp": 1.03233492, + "epoch": 0.3441351530090783, + "flos": 17860131486720.0, + "grad_norm": 1.665225576907177, + "language_loss": 0.7564919, + "learning_rate": 3.0512665616700065e-06, + "loss": 0.78485423, + "num_input_tokens_seen": 61548125, + "router_z_loss_clip": 3.51757812, + "router_z_loss_mlp": 2.83789062, + "step": 2862, + "time_per_iteration": 3.067929983139038 + }, + { + "auxiliary_loss_clip": 0.01523321, + "auxiliary_loss_mlp": 0.01308329, + "balance_loss_clip": 1.17071009, + "balance_loss_mlp": 1.02794862, + "epoch": 0.34425539589971743, + "flos": 23115016732320.0, + "grad_norm": 5.923681367446275, + "language_loss": 0.88886398, + "learning_rate": 3.0506038019941933e-06, + "loss": 0.91718054, + "num_input_tokens_seen": 61568135, + "router_z_loss_clip": 3.52734375, + "router_z_loss_mlp": 2.80859375, + "step": 2863, + "time_per_iteration": 3.0155742168426514 + }, + { + "auxiliary_loss_clip": 0.01531637, + "auxiliary_loss_mlp": 0.01320208, + "balance_loss_clip": 1.1787138, + "balance_loss_mlp": 1.03219843, + "epoch": 0.34437563879035654, + "flos": 21909837425760.0, + "grad_norm": 6.634995901128074, + "language_loss": 0.67538512, + "learning_rate": 3.049940882942617e-06, + "loss": 0.70390362, + "num_input_tokens_seen": 61586920, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 2.8828125, + "step": 2864, + "time_per_iteration": 3.0809295177459717 + }, + { + "auxiliary_loss_clip": 0.01525399, + "auxiliary_loss_mlp": 0.01304925, + "balance_loss_clip": 1.17333269, + "balance_loss_mlp": 1.02130198, + "epoch": 0.3444958816809956, + "flos": 23079212184960.0, + "grad_norm": 2.1043950118814725, + "language_loss": 0.80132091, + "learning_rate": 3.0492778046158448e-06, + "loss": 0.82962412, + "num_input_tokens_seen": 61608340, + "router_z_loss_clip": 3.52148438, + "router_z_loss_mlp": 2.84179688, + "step": 2865, + "time_per_iteration": 2.9641499519348145 + }, + { + "auxiliary_loss_clip": 0.01539713, + "auxiliary_loss_mlp": 0.01330938, + "balance_loss_clip": 1.18681085, + "balance_loss_mlp": 1.0450263, + "epoch": 0.3446161245716347, + "flos": 21910823557920.0, + "grad_norm": 2.361705304832691, + "language_loss": 0.76827341, + "learning_rate": 3.0486145671144633e-06, + "loss": 0.7969799, + "num_input_tokens_seen": 61628130, + "router_z_loss_clip": 3.52734375, + "router_z_loss_mlp": 2.86523438, + "step": 2866, + "time_per_iteration": 3.0223965644836426 + }, + { + "auxiliary_loss_clip": 0.01534256, + "auxiliary_loss_mlp": 0.01313062, + "balance_loss_clip": 1.18251586, + "balance_loss_mlp": 1.02715075, + "epoch": 0.3447363674622738, + "flos": 25114836934080.0, + "grad_norm": 2.806196291843946, + "language_loss": 0.7705071, + "learning_rate": 3.047951170539086e-06, + "loss": 0.79898024, + "num_input_tokens_seen": 61647755, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 2.86523438, + "step": 2867, + "time_per_iteration": 3.0188279151916504 + }, + { + "auxiliary_loss_clip": 0.0154187, + "auxiliary_loss_mlp": 0.01302073, + "balance_loss_clip": 1.19154978, + "balance_loss_mlp": 1.02321887, + "epoch": 0.3448566103529129, + "flos": 11986328332800.0, + "grad_norm": 1.98037682566126, + "language_loss": 0.83958679, + "learning_rate": 3.047287614990349e-06, + "loss": 0.86802626, + "num_input_tokens_seen": 61665675, + "router_z_loss_clip": 3.50390625, + "router_z_loss_mlp": 2.79296875, + "step": 2868, + "time_per_iteration": 3.0955047607421875 + }, + { + "auxiliary_loss_clip": 0.01545535, + "auxiliary_loss_mlp": 0.01315623, + "balance_loss_clip": 1.19364333, + "balance_loss_mlp": 1.02875793, + "epoch": 0.344976853243552, + "flos": 40191089644800.0, + "grad_norm": 3.170915133539988, + "language_loss": 0.6247353, + "learning_rate": 3.046623900568914e-06, + "loss": 0.6533469, + "num_input_tokens_seen": 61688240, + "router_z_loss_clip": 3.52148438, + "router_z_loss_mlp": 2.87304688, + "step": 2869, + "time_per_iteration": 3.9483728408813477 + }, + { + "auxiliary_loss_clip": 0.01540187, + "auxiliary_loss_mlp": 0.01304689, + "balance_loss_clip": 1.18839216, + "balance_loss_mlp": 1.01934969, + "epoch": 0.34509709613419104, + "flos": 28725785546400.0, + "grad_norm": 3.688305933707378, + "language_loss": 0.70090377, + "learning_rate": 3.045960027375465e-06, + "loss": 0.72935247, + "num_input_tokens_seen": 61706075, + "router_z_loss_clip": 3.51367188, + "router_z_loss_mlp": 2.85546875, + "step": 2870, + "time_per_iteration": 4.030401229858398 + }, + { + "auxiliary_loss_clip": 0.01541461, + "auxiliary_loss_mlp": 0.01319176, + "balance_loss_clip": 1.18941236, + "balance_loss_mlp": 1.03192902, + "epoch": 0.34521733902483015, + "flos": 29969993293920.0, + "grad_norm": 3.4660859252387586, + "language_loss": 0.82835519, + "learning_rate": 3.045295995510711e-06, + "loss": 0.85696155, + "num_input_tokens_seen": 61723045, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 2.87695312, + "step": 2871, + "time_per_iteration": 3.040565013885498 + }, + { + "auxiliary_loss_clip": 0.01545942, + "auxiliary_loss_mlp": 0.01297998, + "balance_loss_clip": 1.19476223, + "balance_loss_mlp": 1.01704574, + "epoch": 0.34533758191546926, + "flos": 27925265786400.0, + "grad_norm": 3.204001844634173, + "language_loss": 0.74022233, + "learning_rate": 3.0446318050753865e-06, + "loss": 0.76866168, + "num_input_tokens_seen": 61743525, + "router_z_loss_clip": 3.50976562, + "router_z_loss_mlp": 2.81445312, + "step": 2872, + "time_per_iteration": 2.995987892150879 + }, + { + "auxiliary_loss_clip": 0.01539939, + "auxiliary_loss_mlp": 0.01310556, + "balance_loss_clip": 1.18785536, + "balance_loss_mlp": 1.02826881, + "epoch": 0.3454578248061083, + "flos": 27128349201600.0, + "grad_norm": 2.390827365913928, + "language_loss": 0.77563757, + "learning_rate": 3.0439674561702474e-06, + "loss": 0.80414248, + "num_input_tokens_seen": 61763025, + "router_z_loss_clip": 3.52148438, + "router_z_loss_mlp": 2.82617188, + "step": 2873, + "time_per_iteration": 3.8578853607177734 + }, + { + "auxiliary_loss_clip": 0.01534058, + "auxiliary_loss_mlp": 0.01318037, + "balance_loss_clip": 1.18077767, + "balance_loss_mlp": 1.03594065, + "epoch": 0.3455780676967474, + "flos": 19026851274720.0, + "grad_norm": 3.0074117423986726, + "language_loss": 0.87796313, + "learning_rate": 3.043302948896076e-06, + "loss": 0.90648413, + "num_input_tokens_seen": 61781630, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 2.82617188, + "step": 2874, + "time_per_iteration": 2.9782700538635254 + }, + { + "auxiliary_loss_clip": 0.01538742, + "auxiliary_loss_mlp": 0.01325557, + "balance_loss_clip": 1.18678188, + "balance_loss_mlp": 1.03792906, + "epoch": 0.34569831058738654, + "flos": 34498888342560.0, + "grad_norm": 5.149782984307426, + "language_loss": 0.60750675, + "learning_rate": 3.0426382833536756e-06, + "loss": 0.63614976, + "num_input_tokens_seen": 61804985, + "router_z_loss_clip": 3.52148438, + "router_z_loss_mlp": 2.87890625, + "step": 2875, + "time_per_iteration": 3.044961452484131 + }, + { + "auxiliary_loss_clip": 0.015361, + "auxiliary_loss_mlp": 0.0131058, + "balance_loss_clip": 1.18510151, + "balance_loss_mlp": 1.02714837, + "epoch": 0.3458185534780256, + "flos": 31141340317440.0, + "grad_norm": 3.8983052008487427, + "language_loss": 0.77818942, + "learning_rate": 3.041973459643877e-06, + "loss": 0.80665618, + "num_input_tokens_seen": 61824440, + "router_z_loss_clip": 3.515625, + "router_z_loss_mlp": 2.83789062, + "step": 2876, + "time_per_iteration": 3.8984193801879883 + }, + { + "auxiliary_loss_clip": 0.01539655, + "auxiliary_loss_mlp": 0.01326141, + "balance_loss_clip": 1.18887043, + "balance_loss_mlp": 1.04118276, + "epoch": 0.3459387963686647, + "flos": 32455260751680.0, + "grad_norm": 2.194792355330141, + "language_loss": 0.67088962, + "learning_rate": 3.0413084778675334e-06, + "loss": 0.69954753, + "num_input_tokens_seen": 61845690, + "router_z_loss_clip": 3.51171875, + "router_z_loss_mlp": 2.85546875, + "step": 2877, + "time_per_iteration": 2.994948148727417 + }, + { + "auxiliary_loss_clip": 0.01533091, + "auxiliary_loss_mlp": 0.01323215, + "balance_loss_clip": 1.1807282, + "balance_loss_mlp": 1.04073715, + "epoch": 0.3460590392593038, + "flos": 24677672590080.0, + "grad_norm": 2.222493749192025, + "language_loss": 0.84065127, + "learning_rate": 3.0406433381255214e-06, + "loss": 0.86921436, + "num_input_tokens_seen": 61863725, + "router_z_loss_clip": 3.52734375, + "router_z_loss_mlp": 2.83007812, + "step": 2878, + "time_per_iteration": 3.018655300140381 + }, + { + "auxiliary_loss_clip": 0.01546247, + "auxiliary_loss_mlp": 0.01322757, + "balance_loss_clip": 1.19414544, + "balance_loss_mlp": 1.03760874, + "epoch": 0.34617928214994287, + "flos": 18809331091200.0, + "grad_norm": 2.6044666913077474, + "language_loss": 0.81991184, + "learning_rate": 3.0399780405187425e-06, + "loss": 0.84860194, + "num_input_tokens_seen": 61882720, + "router_z_loss_clip": 3.52148438, + "router_z_loss_mlp": 2.85546875, + "step": 2879, + "time_per_iteration": 3.0530784130096436 + }, + { + "auxiliary_loss_clip": 0.0153753, + "auxiliary_loss_mlp": 0.01319068, + "balance_loss_clip": 1.18631089, + "balance_loss_mlp": 1.03659022, + "epoch": 0.346299525040582, + "flos": 24859653723360.0, + "grad_norm": 2.6080184029840963, + "language_loss": 0.78647721, + "learning_rate": 3.0393125851481216e-06, + "loss": 0.81504315, + "num_input_tokens_seen": 61902595, + "router_z_loss_clip": 3.51367188, + "router_z_loss_mlp": 2.83007812, + "step": 2880, + "time_per_iteration": 3.0716559886932373 + }, + { + "auxiliary_loss_clip": 0.01537493, + "auxiliary_loss_mlp": 0.0131738, + "balance_loss_clip": 1.18487573, + "balance_loss_mlp": 1.0354743, + "epoch": 0.3464197679312211, + "flos": 16436749289760.0, + "grad_norm": 2.3267411723368805, + "language_loss": 0.86250734, + "learning_rate": 3.038646972114608e-06, + "loss": 0.89105606, + "num_input_tokens_seen": 61918920, + "router_z_loss_clip": 3.52539062, + "router_z_loss_mlp": 2.82421875, + "step": 2881, + "time_per_iteration": 3.2182958126068115 + }, + { + "auxiliary_loss_clip": 0.01546853, + "auxiliary_loss_mlp": 0.01311546, + "balance_loss_clip": 1.19505167, + "balance_loss_mlp": 1.03097534, + "epoch": 0.34654001082186014, + "flos": 22384664796960.0, + "grad_norm": 2.2765756607516074, + "language_loss": 0.67429596, + "learning_rate": 3.037981201519174e-06, + "loss": 0.70288002, + "num_input_tokens_seen": 61939520, + "router_z_loss_clip": 3.51757812, + "router_z_loss_mlp": 2.81054688, + "step": 2882, + "time_per_iteration": 3.1984293460845947 + }, + { + "auxiliary_loss_clip": 0.01536795, + "auxiliary_loss_mlp": 0.01313966, + "balance_loss_clip": 1.18501806, + "balance_loss_mlp": 1.03320432, + "epoch": 0.34666025371249926, + "flos": 19575980640000.0, + "grad_norm": 8.69975119349105, + "language_loss": 0.71246868, + "learning_rate": 3.0373152734628175e-06, + "loss": 0.74097627, + "num_input_tokens_seen": 61957800, + "router_z_loss_clip": 3.515625, + "router_z_loss_mlp": 2.8125, + "step": 2883, + "time_per_iteration": 3.0032780170440674 + }, + { + "auxiliary_loss_clip": 0.01530066, + "auxiliary_loss_mlp": 0.01304087, + "balance_loss_clip": 1.17703736, + "balance_loss_mlp": 1.02427864, + "epoch": 0.34678049660313837, + "flos": 15269422651200.0, + "grad_norm": 1.976679297856648, + "language_loss": 0.76026207, + "learning_rate": 3.0366491880465584e-06, + "loss": 0.7886036, + "num_input_tokens_seen": 61975820, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 2.80273438, + "step": 2884, + "time_per_iteration": 3.0416901111602783 + }, + { + "auxiliary_loss_clip": 0.01537282, + "auxiliary_loss_mlp": 0.01319465, + "balance_loss_clip": 1.18458641, + "balance_loss_mlp": 1.03546071, + "epoch": 0.3469007394937774, + "flos": 21183847228800.0, + "grad_norm": 1.812940631048701, + "language_loss": 0.82250935, + "learning_rate": 3.035982945371443e-06, + "loss": 0.85107684, + "num_input_tokens_seen": 61997515, + "router_z_loss_clip": 3.52734375, + "router_z_loss_mlp": 2.84570312, + "step": 2885, + "time_per_iteration": 2.994595527648926 + }, + { + "auxiliary_loss_clip": 0.01538634, + "auxiliary_loss_mlp": 0.0131557, + "balance_loss_clip": 1.18727231, + "balance_loss_mlp": 1.03461802, + "epoch": 0.34702098238441653, + "flos": 22377268805760.0, + "grad_norm": 2.548126370398348, + "language_loss": 0.85370952, + "learning_rate": 3.035316545538537e-06, + "loss": 0.88225162, + "num_input_tokens_seen": 62016310, + "router_z_loss_clip": 3.51367188, + "router_z_loss_mlp": 2.81445312, + "step": 2886, + "time_per_iteration": 2.9925801753997803 + }, + { + "auxiliary_loss_clip": 0.01533606, + "auxiliary_loss_mlp": 0.01312902, + "balance_loss_clip": 1.18042517, + "balance_loss_mlp": 1.02927923, + "epoch": 0.3471412252750556, + "flos": 22931632257120.0, + "grad_norm": 2.3841146529885715, + "language_loss": 0.79918313, + "learning_rate": 3.034649988648935e-06, + "loss": 0.82764828, + "num_input_tokens_seen": 62036075, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 2.84179688, + "step": 2887, + "time_per_iteration": 2.9970810413360596 + }, + { + "auxiliary_loss_clip": 0.01534206, + "auxiliary_loss_mlp": 0.01315701, + "balance_loss_clip": 1.18245184, + "balance_loss_mlp": 1.03646493, + "epoch": 0.3472614681656947, + "flos": 21326382711360.0, + "grad_norm": 1.9614533321038634, + "language_loss": 0.80346358, + "learning_rate": 3.033983274803752e-06, + "loss": 0.8319627, + "num_input_tokens_seen": 62055865, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 2.796875, + "step": 2888, + "time_per_iteration": 2.951417922973633 + }, + { + "auxiliary_loss_clip": 0.0153911, + "auxiliary_loss_mlp": 0.01326197, + "balance_loss_clip": 1.18777323, + "balance_loss_mlp": 1.04390955, + "epoch": 0.3473817110563338, + "flos": 23477917010400.0, + "grad_norm": 2.351612000267413, + "language_loss": 0.72543001, + "learning_rate": 3.0333164041041283e-06, + "loss": 0.75408304, + "num_input_tokens_seen": 62072180, + "router_z_loss_clip": 3.51367188, + "router_z_loss_mlp": 2.828125, + "step": 2889, + "time_per_iteration": 3.001909017562866 + }, + { + "auxiliary_loss_clip": 0.01539718, + "auxiliary_loss_mlp": 0.0131184, + "balance_loss_clip": 1.18782473, + "balance_loss_mlp": 1.03260422, + "epoch": 0.34750195394697286, + "flos": 22348215470880.0, + "grad_norm": 1.9634535327285891, + "language_loss": 0.72161531, + "learning_rate": 3.032649376651228e-06, + "loss": 0.75013095, + "num_input_tokens_seen": 62091600, + "router_z_loss_clip": 3.51757812, + "router_z_loss_mlp": 2.796875, + "step": 2890, + "time_per_iteration": 3.0153422355651855 + }, + { + "auxiliary_loss_clip": 0.01544636, + "auxiliary_loss_mlp": 0.01324109, + "balance_loss_clip": 1.19252408, + "balance_loss_mlp": 1.03953254, + "epoch": 0.347622196837612, + "flos": 29097523085760.0, + "grad_norm": 1.7580155042656451, + "language_loss": 0.75821638, + "learning_rate": 3.031982192546238e-06, + "loss": 0.78690386, + "num_input_tokens_seen": 62114695, + "router_z_loss_clip": 3.52539062, + "router_z_loss_mlp": 2.84960938, + "step": 2891, + "time_per_iteration": 3.0397467613220215 + }, + { + "auxiliary_loss_clip": 0.01531675, + "auxiliary_loss_mlp": 0.01311337, + "balance_loss_clip": 1.17905581, + "balance_loss_mlp": 1.02924037, + "epoch": 0.3477424397282511, + "flos": 22457942730720.0, + "grad_norm": 2.782680359372551, + "language_loss": 0.94985938, + "learning_rate": 3.0313148518903696e-06, + "loss": 0.97828948, + "num_input_tokens_seen": 62134520, + "router_z_loss_clip": 3.52539062, + "router_z_loss_mlp": 2.82617188, + "step": 2892, + "time_per_iteration": 3.051372766494751 + }, + { + "auxiliary_loss_clip": 0.01539156, + "auxiliary_loss_mlp": 0.01312788, + "balance_loss_clip": 1.18756127, + "balance_loss_mlp": 1.03164482, + "epoch": 0.34786268261889014, + "flos": 15780623492160.0, + "grad_norm": 3.415378638803517, + "language_loss": 0.81319606, + "learning_rate": 3.030647354784859e-06, + "loss": 0.84171551, + "num_input_tokens_seen": 62151560, + "router_z_loss_clip": 3.51757812, + "router_z_loss_mlp": 2.81640625, + "step": 2893, + "time_per_iteration": 3.1414260864257812 + }, + { + "auxiliary_loss_clip": 0.01536615, + "auxiliary_loss_mlp": 0.01302324, + "balance_loss_clip": 1.18589997, + "balance_loss_mlp": 1.02289772, + "epoch": 0.34798292550952925, + "flos": 20779756604640.0, + "grad_norm": 1.972363383763592, + "language_loss": 0.77664089, + "learning_rate": 3.029979701330964e-06, + "loss": 0.80503023, + "num_input_tokens_seen": 62170985, + "router_z_loss_clip": 3.50195312, + "router_z_loss_mlp": 2.79882812, + "step": 2894, + "time_per_iteration": 3.002338171005249 + }, + { + "auxiliary_loss_clip": 0.01538779, + "auxiliary_loss_mlp": 0.01302026, + "balance_loss_clip": 1.18567801, + "balance_loss_mlp": 1.02259946, + "epoch": 0.34810316840016836, + "flos": 19939867050240.0, + "grad_norm": 2.532138086618881, + "language_loss": 0.80328667, + "learning_rate": 3.029311891629966e-06, + "loss": 0.83169472, + "num_input_tokens_seen": 62189440, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 2.79882812, + "step": 2895, + "time_per_iteration": 2.9429852962493896 + }, + { + "auxiliary_loss_clip": 0.01533885, + "auxiliary_loss_mlp": 0.01315702, + "balance_loss_clip": 1.18125665, + "balance_loss_mlp": 1.03322411, + "epoch": 0.3482234112908074, + "flos": 23625572794560.0, + "grad_norm": 1.9560321532014968, + "language_loss": 0.74568427, + "learning_rate": 3.0286439257831744e-06, + "loss": 0.77418011, + "num_input_tokens_seen": 62208910, + "router_z_loss_clip": 3.52539062, + "router_z_loss_mlp": 2.83007812, + "step": 2896, + "time_per_iteration": 3.904125928878784 + }, + { + "auxiliary_loss_clip": 0.01535052, + "auxiliary_loss_mlp": 0.01318486, + "balance_loss_clip": 1.18319345, + "balance_loss_mlp": 1.03486323, + "epoch": 0.3483436541814465, + "flos": 23988928210560.0, + "grad_norm": 1.9652252833736616, + "language_loss": 0.71729994, + "learning_rate": 3.0279758038919156e-06, + "loss": 0.7458353, + "num_input_tokens_seen": 62227135, + "router_z_loss_clip": 3.51757812, + "router_z_loss_mlp": 2.84179688, + "step": 2897, + "time_per_iteration": 4.075021028518677 + }, + { + "auxiliary_loss_clip": 0.01542282, + "auxiliary_loss_mlp": 0.01297977, + "balance_loss_clip": 1.18998432, + "balance_loss_mlp": 1.01530766, + "epoch": 0.34846389707208564, + "flos": 22640568642720.0, + "grad_norm": 2.0137055232380594, + "language_loss": 0.78437835, + "learning_rate": 3.0273075260575455e-06, + "loss": 0.81278098, + "num_input_tokens_seen": 62246035, + "router_z_loss_clip": 3.5234375, + "router_z_loss_mlp": 2.83203125, + "step": 2898, + "time_per_iteration": 3.1466283798217773 + }, + { + "auxiliary_loss_clip": 0.01537476, + "auxiliary_loss_mlp": 0.01314733, + "balance_loss_clip": 1.18591964, + "balance_loss_mlp": 1.03168225, + "epoch": 0.3485841399627247, + "flos": 21794686439040.0, + "grad_norm": 2.577926439475327, + "language_loss": 0.80974418, + "learning_rate": 3.0266390923814396e-06, + "loss": 0.83826625, + "num_input_tokens_seen": 62264095, + "router_z_loss_clip": 3.52148438, + "router_z_loss_mlp": 2.83398438, + "step": 2899, + "time_per_iteration": 3.1476316452026367 + }, + { + "auxiliary_loss_clip": 0.01546094, + "auxiliary_loss_mlp": 0.01315269, + "balance_loss_clip": 1.19234824, + "balance_loss_mlp": 1.03450727, + "epoch": 0.3487043828533638, + "flos": 17020924639200.0, + "grad_norm": 5.387694854310621, + "language_loss": 0.82203412, + "learning_rate": 3.0259705029650008e-06, + "loss": 0.85064781, + "num_input_tokens_seen": 62282025, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 2.81054688, + "step": 2900, + "time_per_iteration": 3.1573362350463867 + }, + { + "auxiliary_loss_clip": 0.01538852, + "auxiliary_loss_mlp": 0.01298954, + "balance_loss_clip": 1.18738103, + "balance_loss_mlp": 1.0193367, + "epoch": 0.34882462574400286, + "flos": 22603322825280.0, + "grad_norm": 1.7246702645909135, + "language_loss": 0.73123693, + "learning_rate": 3.025301757909652e-06, + "loss": 0.75961506, + "num_input_tokens_seen": 62302220, + "router_z_loss_clip": 3.51757812, + "router_z_loss_mlp": 2.80078125, + "step": 2901, + "time_per_iteration": 3.8765857219696045 + }, + { + "auxiliary_loss_clip": 0.01531135, + "auxiliary_loss_mlp": 0.01315695, + "balance_loss_clip": 1.1771307, + "balance_loss_mlp": 1.03016448, + "epoch": 0.34894486863464197, + "flos": 29864096778240.0, + "grad_norm": 1.5259092986424547, + "language_loss": 0.80270195, + "learning_rate": 3.024632857316842e-06, + "loss": 0.8311702, + "num_input_tokens_seen": 62323535, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 2.859375, + "step": 2902, + "time_per_iteration": 3.05523681640625 + }, + { + "auxiliary_loss_clip": 0.01541023, + "auxiliary_loss_mlp": 0.01313333, + "balance_loss_clip": 1.18814802, + "balance_loss_mlp": 1.03066397, + "epoch": 0.3490651115252811, + "flos": 22124171643840.0, + "grad_norm": 3.2045272126802837, + "language_loss": 0.77435088, + "learning_rate": 3.0239638012880412e-06, + "loss": 0.80289441, + "num_input_tokens_seen": 62343430, + "router_z_loss_clip": 3.52539062, + "router_z_loss_mlp": 2.83203125, + "step": 2903, + "time_per_iteration": 3.160141706466675 + }, + { + "auxiliary_loss_clip": 0.01539354, + "auxiliary_loss_mlp": 0.01304845, + "balance_loss_clip": 1.1862421, + "balance_loss_mlp": 1.02122188, + "epoch": 0.34918535441592014, + "flos": 12678296605920.0, + "grad_norm": 2.74509303188296, + "language_loss": 0.81181681, + "learning_rate": 3.0232945899247466e-06, + "loss": 0.84025884, + "num_input_tokens_seen": 62360365, + "router_z_loss_clip": 3.52734375, + "router_z_loss_mlp": 2.83984375, + "step": 2904, + "time_per_iteration": 3.887587547302246 + }, + { + "auxiliary_loss_clip": 0.01546447, + "auxiliary_loss_mlp": 0.01320987, + "balance_loss_clip": 1.19492841, + "balance_loss_mlp": 1.03831756, + "epoch": 0.34930559730655925, + "flos": 23187877456320.0, + "grad_norm": 2.377440363358167, + "language_loss": 0.77315855, + "learning_rate": 3.022625223328476e-06, + "loss": 0.80183291, + "num_input_tokens_seen": 62382105, + "router_z_loss_clip": 3.51171875, + "router_z_loss_mlp": 2.83203125, + "step": 2905, + "time_per_iteration": 3.118455648422241 + }, + { + "auxiliary_loss_clip": 0.0154728, + "auxiliary_loss_mlp": 0.01321953, + "balance_loss_clip": 1.19564986, + "balance_loss_mlp": 1.03604126, + "epoch": 0.34942584019719836, + "flos": 22857140622240.0, + "grad_norm": 1.5188809180234377, + "language_loss": 0.69514269, + "learning_rate": 3.0219557016007723e-06, + "loss": 0.72383499, + "num_input_tokens_seen": 62402235, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 2.86132812, + "step": 2906, + "time_per_iteration": 3.0562729835510254 + }, + { + "auxiliary_loss_clip": 0.01549511, + "auxiliary_loss_mlp": 0.01307549, + "balance_loss_clip": 1.19775009, + "balance_loss_mlp": 1.02793169, + "epoch": 0.3495460830878374, + "flos": 24428368244160.0, + "grad_norm": 2.1181359530003943, + "language_loss": 0.69919103, + "learning_rate": 3.021286024843202e-06, + "loss": 0.72776163, + "num_input_tokens_seen": 62420430, + "router_z_loss_clip": 3.51757812, + "router_z_loss_mlp": 2.80078125, + "step": 2907, + "time_per_iteration": 3.013929843902588 + }, + { + "auxiliary_loss_clip": 0.01629252, + "auxiliary_loss_mlp": 0.01221252, + "balance_loss_clip": 1.28149986, + "balance_loss_mlp": 0.99694824, + "epoch": 0.3496663259784765, + "flos": 70014299502240.0, + "grad_norm": 1.091088354713504, + "language_loss": 0.64787102, + "learning_rate": 3.0206161931573526e-06, + "loss": 0.6763761, + "num_input_tokens_seen": 62472980, + "router_z_loss_clip": 3.46875, + "router_z_loss_mlp": 2.25, + "step": 2908, + "time_per_iteration": 3.3504297733306885 + }, + { + "auxiliary_loss_clip": 0.01541609, + "auxiliary_loss_mlp": 0.01315807, + "balance_loss_clip": 1.18950677, + "balance_loss_mlp": 1.03504562, + "epoch": 0.34978656886911563, + "flos": 28695253013280.0, + "grad_norm": 1.6540046046706791, + "language_loss": 0.92887068, + "learning_rate": 3.0199462066448388e-06, + "loss": 0.95744485, + "num_input_tokens_seen": 62495175, + "router_z_loss_clip": 3.5234375, + "router_z_loss_mlp": 2.8125, + "step": 2909, + "time_per_iteration": 3.122483491897583 + }, + { + "auxiliary_loss_clip": 0.01546024, + "auxiliary_loss_mlp": 0.01315551, + "balance_loss_clip": 1.19269979, + "balance_loss_mlp": 1.03192878, + "epoch": 0.3499068117597547, + "flos": 21144287793600.0, + "grad_norm": 1.8211741166779956, + "language_loss": 0.69179595, + "learning_rate": 3.019276065407296e-06, + "loss": 0.72041172, + "num_input_tokens_seen": 62514295, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 2.83984375, + "step": 2910, + "time_per_iteration": 3.0466854572296143 + }, + { + "auxiliary_loss_clip": 0.01544082, + "auxiliary_loss_mlp": 0.01313522, + "balance_loss_clip": 1.19093108, + "balance_loss_mlp": 1.03104329, + "epoch": 0.3500270546503938, + "flos": 22784848820640.0, + "grad_norm": 2.051399709407933, + "language_loss": 0.80705523, + "learning_rate": 3.018605769546385e-06, + "loss": 0.83563125, + "num_input_tokens_seen": 62534850, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 2.83007812, + "step": 2911, + "time_per_iteration": 2.994992733001709 + }, + { + "auxiliary_loss_clip": 0.01540694, + "auxiliary_loss_mlp": 0.01317231, + "balance_loss_clip": 1.18659854, + "balance_loss_mlp": 1.03437161, + "epoch": 0.3501472975410329, + "flos": 22896320775840.0, + "grad_norm": 2.0632608563900208, + "language_loss": 0.79627275, + "learning_rate": 3.017935319163788e-06, + "loss": 0.82485199, + "num_input_tokens_seen": 62553810, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 2.83398438, + "step": 2912, + "time_per_iteration": 3.05802321434021 + }, + { + "auxiliary_loss_clip": 0.01544312, + "auxiliary_loss_mlp": 0.01328475, + "balance_loss_clip": 1.19261932, + "balance_loss_mlp": 1.03951204, + "epoch": 0.35026754043167196, + "flos": 25448911446240.0, + "grad_norm": 1.7844171683256167, + "language_loss": 0.70617056, + "learning_rate": 3.017264714361213e-06, + "loss": 0.73489845, + "num_input_tokens_seen": 62573460, + "router_z_loss_clip": 3.51757812, + "router_z_loss_mlp": 2.89257812, + "step": 2913, + "time_per_iteration": 2.971428155899048 + }, + { + "auxiliary_loss_clip": 0.01540236, + "auxiliary_loss_mlp": 0.01335685, + "balance_loss_clip": 1.18710601, + "balance_loss_mlp": 1.04881954, + "epoch": 0.3503877833223111, + "flos": 19575828927360.0, + "grad_norm": 2.908188505378413, + "language_loss": 0.82195783, + "learning_rate": 3.016593955240389e-06, + "loss": 0.85071701, + "num_input_tokens_seen": 62592150, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 2.87109375, + "step": 2914, + "time_per_iteration": 3.1003754138946533 + }, + { + "auxiliary_loss_clip": 0.0161645, + "auxiliary_loss_mlp": 0.01250626, + "balance_loss_clip": 1.26616144, + "balance_loss_mlp": 1.01335144, + "epoch": 0.3505080262129502, + "flos": 65078772278400.0, + "grad_norm": 0.8273646505930894, + "language_loss": 0.6362316, + "learning_rate": 3.015923041903071e-06, + "loss": 0.66490233, + "num_input_tokens_seen": 62658275, + "router_z_loss_clip": 3.5, + "router_z_loss_mlp": 2.3671875, + "step": 2915, + "time_per_iteration": 3.5295886993408203 + }, + { + "auxiliary_loss_clip": 0.0155272, + "auxiliary_loss_mlp": 0.01322108, + "balance_loss_clip": 1.2007525, + "balance_loss_mlp": 1.03867579, + "epoch": 0.35062826910358924, + "flos": 29317129318080.0, + "grad_norm": 2.0022426026669984, + "language_loss": 0.83330429, + "learning_rate": 3.0152519744510347e-06, + "loss": 0.86205256, + "num_input_tokens_seen": 62678075, + "router_z_loss_clip": 3.52148438, + "router_z_loss_mlp": 2.83984375, + "step": 2916, + "time_per_iteration": 3.053067922592163 + }, + { + "auxiliary_loss_clip": 0.01535924, + "auxiliary_loss_mlp": 0.01326328, + "balance_loss_clip": 1.18258047, + "balance_loss_mlp": 1.04041648, + "epoch": 0.35074851199422835, + "flos": 23989990199040.0, + "grad_norm": 1.800790281056462, + "language_loss": 0.82939255, + "learning_rate": 3.014580752986081e-06, + "loss": 0.85801506, + "num_input_tokens_seen": 62696950, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 2.86328125, + "step": 2917, + "time_per_iteration": 3.112243890762329 + }, + { + "auxiliary_loss_clip": 0.01541718, + "auxiliary_loss_mlp": 0.01320712, + "balance_loss_clip": 1.18824255, + "balance_loss_mlp": 1.03556371, + "epoch": 0.3508687548848674, + "flos": 15225842831040.0, + "grad_norm": 1.9540256990416807, + "language_loss": 0.78455555, + "learning_rate": 3.0139093776100345e-06, + "loss": 0.81317985, + "num_input_tokens_seen": 62713540, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 2.85546875, + "step": 2918, + "time_per_iteration": 3.0011484622955322 + }, + { + "auxiliary_loss_clip": 0.01538793, + "auxiliary_loss_mlp": 0.01316779, + "balance_loss_clip": 1.18555939, + "balance_loss_mlp": 1.03544497, + "epoch": 0.3509889977755065, + "flos": 21363931954080.0, + "grad_norm": 1.9809556681966871, + "language_loss": 0.75750273, + "learning_rate": 3.013237848424741e-06, + "loss": 0.78605843, + "num_input_tokens_seen": 62732925, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 2.81835938, + "step": 2919, + "time_per_iteration": 3.094181776046753 + }, + { + "auxiliary_loss_clip": 0.01541711, + "auxiliary_loss_mlp": 0.0131683, + "balance_loss_clip": 1.18886232, + "balance_loss_mlp": 1.03301656, + "epoch": 0.35110924066614563, + "flos": 19137375025920.0, + "grad_norm": 3.0197289953320094, + "language_loss": 0.75635624, + "learning_rate": 3.012566165532072e-06, + "loss": 0.78494161, + "num_input_tokens_seen": 62751715, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 2.84375, + "step": 2920, + "time_per_iteration": 3.0692965984344482 + }, + { + "auxiliary_loss_clip": 0.01541727, + "auxiliary_loss_mlp": 0.0131369, + "balance_loss_clip": 1.18987918, + "balance_loss_mlp": 1.03330958, + "epoch": 0.3512294835567847, + "flos": 21983039503200.0, + "grad_norm": 2.191263259631135, + "language_loss": 0.76888347, + "learning_rate": 3.0118943290339207e-06, + "loss": 0.79743767, + "num_input_tokens_seen": 62771925, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 2.80664062, + "step": 2921, + "time_per_iteration": 3.0300283432006836 + }, + { + "auxiliary_loss_clip": 0.01543715, + "auxiliary_loss_mlp": 0.01317649, + "balance_loss_clip": 1.19144022, + "balance_loss_mlp": 1.03536153, + "epoch": 0.3513497264474238, + "flos": 17818979068800.0, + "grad_norm": 2.3367455952147624, + "language_loss": 0.68099493, + "learning_rate": 3.011222339032204e-06, + "loss": 0.70960855, + "num_input_tokens_seen": 62790075, + "router_z_loss_clip": 3.52148438, + "router_z_loss_mlp": 2.828125, + "step": 2922, + "time_per_iteration": 3.1135082244873047 + }, + { + "auxiliary_loss_clip": 0.01539809, + "auxiliary_loss_mlp": 0.01309454, + "balance_loss_clip": 1.18722558, + "balance_loss_mlp": 1.02964544, + "epoch": 0.3514699693380629, + "flos": 26945875002240.0, + "grad_norm": 2.111009841040159, + "language_loss": 0.69399655, + "learning_rate": 3.0105501956288626e-06, + "loss": 0.72248912, + "num_input_tokens_seen": 62810545, + "router_z_loss_clip": 3.5234375, + "router_z_loss_mlp": 2.80273438, + "step": 2923, + "time_per_iteration": 3.077716827392578 + }, + { + "auxiliary_loss_clip": 0.01539457, + "auxiliary_loss_mlp": 0.01320489, + "balance_loss_clip": 1.18599689, + "balance_loss_mlp": 1.03743815, + "epoch": 0.35159021222870196, + "flos": 15269346794880.0, + "grad_norm": 2.2928857794787505, + "language_loss": 0.72776496, + "learning_rate": 3.0098778989258602e-06, + "loss": 0.75636446, + "num_input_tokens_seen": 62829155, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 2.8359375, + "step": 2924, + "time_per_iteration": 3.987980842590332 + }, + { + "auxiliary_loss_clip": 0.01546702, + "auxiliary_loss_mlp": 0.01312128, + "balance_loss_clip": 1.1936779, + "balance_loss_mlp": 1.03174734, + "epoch": 0.35171045511934107, + "flos": 13985503755840.0, + "grad_norm": 2.1376882566804634, + "language_loss": 0.88082331, + "learning_rate": 3.009205449025183e-06, + "loss": 0.90941161, + "num_input_tokens_seen": 62845350, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 2.80859375, + "step": 2925, + "time_per_iteration": 4.012751340866089 + }, + { + "auxiliary_loss_clip": 0.01538424, + "auxiliary_loss_mlp": 0.01300147, + "balance_loss_clip": 1.18413246, + "balance_loss_mlp": 1.02167439, + "epoch": 0.3518306980099802, + "flos": 14285214990720.0, + "grad_norm": 2.2213628102868093, + "language_loss": 0.63151491, + "learning_rate": 3.008532846028842e-06, + "loss": 0.65990067, + "num_input_tokens_seen": 62862110, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.7890625, + "step": 2926, + "time_per_iteration": 3.0969972610473633 + }, + { + "auxiliary_loss_clip": 0.01539239, + "auxiliary_loss_mlp": 0.01316436, + "balance_loss_clip": 1.18608236, + "balance_loss_mlp": 1.03510249, + "epoch": 0.35195094090061924, + "flos": 27055336764960.0, + "grad_norm": 2.7841821808365737, + "language_loss": 0.72003341, + "learning_rate": 3.0078600900388694e-06, + "loss": 0.74859017, + "num_input_tokens_seen": 62882415, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 2.81835938, + "step": 2927, + "time_per_iteration": 2.9857349395751953 + }, + { + "auxiliary_loss_clip": 0.01536389, + "auxiliary_loss_mlp": 0.01306217, + "balance_loss_clip": 1.1834538, + "balance_loss_mlp": 1.02602768, + "epoch": 0.35207118379125835, + "flos": 25631613214560.0, + "grad_norm": 2.3193749453939208, + "language_loss": 0.7406776, + "learning_rate": 3.007187181157323e-06, + "loss": 0.76910371, + "num_input_tokens_seen": 62902425, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 2.80664062, + "step": 2928, + "time_per_iteration": 3.873512029647827 + }, + { + "auxiliary_loss_clip": 0.01537242, + "auxiliary_loss_mlp": 0.01321713, + "balance_loss_clip": 1.18407369, + "balance_loss_mlp": 1.0420959, + "epoch": 0.35219142668189746, + "flos": 18006687354240.0, + "grad_norm": 2.5915685473756604, + "language_loss": 0.68392515, + "learning_rate": 3.006514119486282e-06, + "loss": 0.71251464, + "num_input_tokens_seen": 62919255, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 2.80078125, + "step": 2929, + "time_per_iteration": 3.02296781539917 + }, + { + "auxiliary_loss_clip": 0.01532364, + "auxiliary_loss_mlp": 0.01304997, + "balance_loss_clip": 1.17865443, + "balance_loss_mlp": 1.02671504, + "epoch": 0.3523116695725365, + "flos": 14029880067360.0, + "grad_norm": 1.784888296624185, + "language_loss": 0.70027065, + "learning_rate": 3.005840905127849e-06, + "loss": 0.72864425, + "num_input_tokens_seen": 62936160, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 2.78710938, + "step": 2930, + "time_per_iteration": 2.9263572692871094 + }, + { + "auxiliary_loss_clip": 0.01535677, + "auxiliary_loss_mlp": 0.01298033, + "balance_loss_clip": 1.18199146, + "balance_loss_mlp": 1.01803434, + "epoch": 0.3524319124631756, + "flos": 21436109971200.0, + "grad_norm": 2.350151958517318, + "language_loss": 0.86754024, + "learning_rate": 3.0051675381841516e-06, + "loss": 0.89587736, + "num_input_tokens_seen": 62953470, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 2.8046875, + "step": 2931, + "time_per_iteration": 3.8341152667999268 + }, + { + "auxiliary_loss_clip": 0.0153313, + "auxiliary_loss_mlp": 0.0131336, + "balance_loss_clip": 1.17849016, + "balance_loss_mlp": 1.03317022, + "epoch": 0.3525521553538147, + "flos": 26325174470400.0, + "grad_norm": 1.9638723459326348, + "language_loss": 0.76743078, + "learning_rate": 3.0044940187573363e-06, + "loss": 0.7958957, + "num_input_tokens_seen": 62974480, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.80664062, + "step": 2932, + "time_per_iteration": 3.0357003211975098 + }, + { + "auxiliary_loss_clip": 0.01537166, + "auxiliary_loss_mlp": 0.0131489, + "balance_loss_clip": 1.18255472, + "balance_loss_mlp": 1.03317499, + "epoch": 0.3526723982444538, + "flos": 21545457949440.0, + "grad_norm": 2.7299523040703977, + "language_loss": 0.6500569, + "learning_rate": 3.003820346949578e-06, + "loss": 0.67857748, + "num_input_tokens_seen": 62992560, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.82226562, + "step": 2933, + "time_per_iteration": 3.035883665084839 + }, + { + "auxiliary_loss_clip": 0.0152871, + "auxiliary_loss_mlp": 0.01307785, + "balance_loss_clip": 1.17505765, + "balance_loss_mlp": 1.02492523, + "epoch": 0.3527926411350929, + "flos": 23735982761280.0, + "grad_norm": 2.1367513395692432, + "language_loss": 0.797189, + "learning_rate": 3.003146522863071e-06, + "loss": 0.82555395, + "num_input_tokens_seen": 63013445, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 2.83398438, + "step": 2934, + "time_per_iteration": 3.073906183242798 + }, + { + "auxiliary_loss_clip": 0.0153505, + "auxiliary_loss_mlp": 0.01304303, + "balance_loss_clip": 1.18202472, + "balance_loss_mlp": 1.02430391, + "epoch": 0.35291288402573195, + "flos": 30448082486880.0, + "grad_norm": 2.5448062357980947, + "language_loss": 0.85876524, + "learning_rate": 3.0024725466000345e-06, + "loss": 0.88715881, + "num_input_tokens_seen": 63033400, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 2.8046875, + "step": 2935, + "time_per_iteration": 3.0554580688476562 + }, + { + "auxiliary_loss_clip": 0.01540921, + "auxiliary_loss_mlp": 0.0130946, + "balance_loss_clip": 1.18759513, + "balance_loss_mlp": 1.03098679, + "epoch": 0.35303312691637107, + "flos": 23114409881760.0, + "grad_norm": 2.577168267964127, + "language_loss": 0.79181433, + "learning_rate": 3.0017984182627087e-06, + "loss": 0.82031822, + "num_input_tokens_seen": 63052725, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 2.7890625, + "step": 2936, + "time_per_iteration": 3.081965208053589 + }, + { + "auxiliary_loss_clip": 0.01539533, + "auxiliary_loss_mlp": 0.0131387, + "balance_loss_clip": 1.18786025, + "balance_loss_mlp": 1.03253615, + "epoch": 0.3531533698070102, + "flos": 21837886977600.0, + "grad_norm": 2.2686597835559397, + "language_loss": 0.82287729, + "learning_rate": 3.00112413795336e-06, + "loss": 0.85141134, + "num_input_tokens_seen": 63072560, + "router_z_loss_clip": 3.515625, + "router_z_loss_mlp": 2.81835938, + "step": 2937, + "time_per_iteration": 3.089759111404419 + }, + { + "auxiliary_loss_clip": 0.01535018, + "auxiliary_loss_mlp": 0.0131558, + "balance_loss_clip": 1.18356991, + "balance_loss_mlp": 1.0323391, + "epoch": 0.35327361269764923, + "flos": 15780813132960.0, + "grad_norm": 2.003049184203648, + "language_loss": 0.80313021, + "learning_rate": 3.000449705774275e-06, + "loss": 0.83163619, + "num_input_tokens_seen": 63090800, + "router_z_loss_clip": 3.515625, + "router_z_loss_mlp": 2.83789062, + "step": 2938, + "time_per_iteration": 3.117274284362793 + }, + { + "auxiliary_loss_clip": 0.01548022, + "auxiliary_loss_mlp": 0.01313001, + "balance_loss_clip": 1.19563246, + "balance_loss_mlp": 1.03109515, + "epoch": 0.35339385558828834, + "flos": 22092577122240.0, + "grad_norm": 2.2252683238325366, + "language_loss": 0.71604884, + "learning_rate": 2.9997751218277654e-06, + "loss": 0.74465907, + "num_input_tokens_seen": 63108955, + "router_z_loss_clip": 3.52539062, + "router_z_loss_mlp": 2.82421875, + "step": 2939, + "time_per_iteration": 3.0829358100891113 + }, + { + "auxiliary_loss_clip": 0.01542486, + "auxiliary_loss_mlp": 0.01316574, + "balance_loss_clip": 1.18888807, + "balance_loss_mlp": 1.03504944, + "epoch": 0.35351409847892745, + "flos": 24167002743360.0, + "grad_norm": 2.538530281904401, + "language_loss": 0.7781589, + "learning_rate": 2.999100386216166e-06, + "loss": 0.80674958, + "num_input_tokens_seen": 63127895, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 2.8203125, + "step": 2940, + "time_per_iteration": 3.059354305267334 + }, + { + "auxiliary_loss_clip": 0.01538708, + "auxiliary_loss_mlp": 0.01310669, + "balance_loss_clip": 1.18658996, + "balance_loss_mlp": 1.03028834, + "epoch": 0.3536343413695665, + "flos": 27054729914400.0, + "grad_norm": 3.583219170864832, + "language_loss": 0.74575996, + "learning_rate": 2.998425499041831e-06, + "loss": 0.77425373, + "num_input_tokens_seen": 63148410, + "router_z_loss_clip": 3.52148438, + "router_z_loss_mlp": 2.80859375, + "step": 2941, + "time_per_iteration": 3.0030856132507324 + }, + { + "auxiliary_loss_clip": 0.01604043, + "auxiliary_loss_mlp": 0.01252098, + "balance_loss_clip": 1.25389647, + "balance_loss_mlp": 1.01940155, + "epoch": 0.3537545842602056, + "flos": 65998197912960.0, + "grad_norm": 1.2678350899704605, + "language_loss": 0.64537537, + "learning_rate": 2.997750460407142e-06, + "loss": 0.67393678, + "num_input_tokens_seen": 63209765, + "router_z_loss_clip": 3.5, + "router_z_loss_mlp": 2.3203125, + "step": 2942, + "time_per_iteration": 3.541621208190918 + }, + { + "auxiliary_loss_clip": 0.01538687, + "auxiliary_loss_mlp": 0.01304805, + "balance_loss_clip": 1.1861999, + "balance_loss_mlp": 1.02308917, + "epoch": 0.35387482715084473, + "flos": 18438579684000.0, + "grad_norm": 2.4442513554364704, + "language_loss": 0.70071971, + "learning_rate": 2.997075270414501e-06, + "loss": 0.72915471, + "num_input_tokens_seen": 63226980, + "router_z_loss_clip": 3.52734375, + "router_z_loss_mlp": 2.82226562, + "step": 2943, + "time_per_iteration": 2.969505786895752 + }, + { + "auxiliary_loss_clip": 0.01601994, + "auxiliary_loss_mlp": 0.01243622, + "balance_loss_clip": 1.25150132, + "balance_loss_mlp": 1.01397705, + "epoch": 0.3539950700414838, + "flos": 65595624415200.0, + "grad_norm": 0.7095073939379043, + "language_loss": 0.57671726, + "learning_rate": 2.9963999291663347e-06, + "loss": 0.60517347, + "num_input_tokens_seen": 63292760, + "router_z_loss_clip": 3.5, + "router_z_loss_mlp": 2.2890625, + "step": 2944, + "time_per_iteration": 3.410715103149414 + }, + { + "auxiliary_loss_clip": 0.01542622, + "auxiliary_loss_mlp": 0.01308835, + "balance_loss_clip": 1.19065619, + "balance_loss_mlp": 1.02979016, + "epoch": 0.3541153129321229, + "flos": 20523738974400.0, + "grad_norm": 2.6390844937216724, + "language_loss": 0.74338984, + "learning_rate": 2.9957244367650915e-06, + "loss": 0.77190447, + "num_input_tokens_seen": 63309005, + "router_z_loss_clip": 3.51757812, + "router_z_loss_mlp": 2.79492188, + "step": 2945, + "time_per_iteration": 2.9871714115142822 + }, + { + "auxiliary_loss_clip": 0.01548281, + "auxiliary_loss_mlp": 0.0130982, + "balance_loss_clip": 1.19661307, + "balance_loss_mlp": 1.03096533, + "epoch": 0.354235555822762, + "flos": 19575639286560.0, + "grad_norm": 2.1070742146595194, + "language_loss": 0.83887631, + "learning_rate": 2.9950487933132425e-06, + "loss": 0.86745733, + "num_input_tokens_seen": 63326420, + "router_z_loss_clip": 3.52148438, + "router_z_loss_mlp": 2.79296875, + "step": 2946, + "time_per_iteration": 2.9077765941619873 + }, + { + "auxiliary_loss_clip": 0.01540265, + "auxiliary_loss_mlp": 0.01308273, + "balance_loss_clip": 1.18773532, + "balance_loss_mlp": 1.02484095, + "epoch": 0.35435579871340106, + "flos": 20779377323040.0, + "grad_norm": 2.120017214876589, + "language_loss": 0.71617746, + "learning_rate": 2.994372998913283e-06, + "loss": 0.74466288, + "num_input_tokens_seen": 63344925, + "router_z_loss_clip": 3.52734375, + "router_z_loss_mlp": 2.83789062, + "step": 2947, + "time_per_iteration": 2.997467517852783 + }, + { + "auxiliary_loss_clip": 0.01544072, + "auxiliary_loss_mlp": 0.01307646, + "balance_loss_clip": 1.19137704, + "balance_loss_mlp": 1.02650309, + "epoch": 0.35447604160404017, + "flos": 23954109795360.0, + "grad_norm": 2.4602987225734494, + "language_loss": 0.62348509, + "learning_rate": 2.99369705366773e-06, + "loss": 0.65200222, + "num_input_tokens_seen": 63365170, + "router_z_loss_clip": 3.52734375, + "router_z_loss_mlp": 2.81640625, + "step": 2948, + "time_per_iteration": 3.2115581035614014 + }, + { + "auxiliary_loss_clip": 0.01542649, + "auxiliary_loss_mlp": 0.0130033, + "balance_loss_clip": 1.19018662, + "balance_loss_mlp": 1.02090311, + "epoch": 0.3545962844946792, + "flos": 23437636940160.0, + "grad_norm": 2.286539460391196, + "language_loss": 0.82276595, + "learning_rate": 2.9930209576791244e-06, + "loss": 0.85119569, + "num_input_tokens_seen": 63383645, + "router_z_loss_clip": 3.52539062, + "router_z_loss_mlp": 2.79882812, + "step": 2949, + "time_per_iteration": 3.205230951309204 + }, + { + "auxiliary_loss_clip": 0.01539173, + "auxiliary_loss_mlp": 0.01296267, + "balance_loss_clip": 1.18712807, + "balance_loss_mlp": 1.01798439, + "epoch": 0.35471652738531834, + "flos": 22087001682720.0, + "grad_norm": 2.6981846569532113, + "language_loss": 0.63837451, + "learning_rate": 2.9923447110500285e-06, + "loss": 0.66672891, + "num_input_tokens_seen": 63402390, + "router_z_loss_clip": 3.52148438, + "router_z_loss_mlp": 2.78710938, + "step": 2950, + "time_per_iteration": 3.019906997680664 + }, + { + "auxiliary_loss_clip": 0.01538498, + "auxiliary_loss_mlp": 0.01299178, + "balance_loss_clip": 1.18608892, + "balance_loss_mlp": 1.02261281, + "epoch": 0.35483677027595745, + "flos": 27344200546080.0, + "grad_norm": 1.6740802987206835, + "language_loss": 0.75473613, + "learning_rate": 2.9916683138830295e-06, + "loss": 0.78311288, + "num_input_tokens_seen": 63423055, + "router_z_loss_clip": 3.52539062, + "router_z_loss_mlp": 2.76953125, + "step": 2951, + "time_per_iteration": 3.0256011486053467 + }, + { + "auxiliary_loss_clip": 0.01540133, + "auxiliary_loss_mlp": 0.0130463, + "balance_loss_clip": 1.18844283, + "balance_loss_mlp": 1.02558517, + "epoch": 0.3549570131665965, + "flos": 13517844806880.0, + "grad_norm": 2.0953605047037795, + "language_loss": 0.80895722, + "learning_rate": 2.9909917662807353e-06, + "loss": 0.83740479, + "num_input_tokens_seen": 63440855, + "router_z_loss_clip": 3.515625, + "router_z_loss_mlp": 2.79492188, + "step": 2952, + "time_per_iteration": 4.713984966278076 + }, + { + "auxiliary_loss_clip": 0.0154014, + "auxiliary_loss_mlp": 0.01306607, + "balance_loss_clip": 1.1869446, + "balance_loss_mlp": 1.02470088, + "epoch": 0.3550772560572356, + "flos": 20889711433440.0, + "grad_norm": 2.587068514542846, + "language_loss": 0.68919623, + "learning_rate": 2.9903150683457783e-06, + "loss": 0.71766365, + "num_input_tokens_seen": 63459400, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 2.82421875, + "step": 2953, + "time_per_iteration": 3.032421350479126 + }, + { + "auxiliary_loss_clip": 0.01540477, + "auxiliary_loss_mlp": 0.01305991, + "balance_loss_clip": 1.18774438, + "balance_loss_mlp": 1.02789998, + "epoch": 0.3551974989478747, + "flos": 20196150177600.0, + "grad_norm": 2.1577809955854006, + "language_loss": 0.64999115, + "learning_rate": 2.9896382201808126e-06, + "loss": 0.67845583, + "num_input_tokens_seen": 63476800, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 2.78515625, + "step": 2954, + "time_per_iteration": 3.0437614917755127 + }, + { + "auxiliary_loss_clip": 0.0154529, + "auxiliary_loss_mlp": 0.01302275, + "balance_loss_clip": 1.19258773, + "balance_loss_mlp": 1.02094078, + "epoch": 0.3553177418385138, + "flos": 19830898353600.0, + "grad_norm": 2.6696063239789476, + "language_loss": 0.8146565, + "learning_rate": 2.988961221888516e-06, + "loss": 0.84313208, + "num_input_tokens_seen": 63493475, + "router_z_loss_clip": 3.52148438, + "router_z_loss_mlp": 2.81835938, + "step": 2955, + "time_per_iteration": 3.030759334564209 + }, + { + "auxiliary_loss_clip": 0.01543034, + "auxiliary_loss_mlp": 0.01333206, + "balance_loss_clip": 1.19086671, + "balance_loss_mlp": 1.05530512, + "epoch": 0.3554379847291529, + "flos": 14830854965280.0, + "grad_norm": 2.529327586671477, + "language_loss": 0.79029536, + "learning_rate": 2.988284073571589e-06, + "loss": 0.81905782, + "num_input_tokens_seen": 63509560, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 2.78320312, + "step": 2956, + "time_per_iteration": 3.877519369125366 + }, + { + "auxiliary_loss_clip": 0.01541807, + "auxiliary_loss_mlp": 0.0130721, + "balance_loss_clip": 1.18968916, + "balance_loss_mlp": 1.02454114, + "epoch": 0.355558227619792, + "flos": 20487441360960.0, + "grad_norm": 2.5978336290051702, + "language_loss": 0.73085201, + "learning_rate": 2.9876067753327528e-06, + "loss": 0.75934219, + "num_input_tokens_seen": 63527290, + "router_z_loss_clip": 3.52148438, + "router_z_loss_mlp": 2.83203125, + "step": 2957, + "time_per_iteration": 2.982191801071167 + }, + { + "auxiliary_loss_clip": 0.01541478, + "auxiliary_loss_mlp": 0.01309499, + "balance_loss_clip": 1.18914509, + "balance_loss_mlp": 1.02854657, + "epoch": 0.35567847051043106, + "flos": 37666300680000.0, + "grad_norm": 2.2436336960611167, + "language_loss": 0.80671477, + "learning_rate": 2.986929327274754e-06, + "loss": 0.83522463, + "num_input_tokens_seen": 63547870, + "router_z_loss_clip": 3.52539062, + "router_z_loss_mlp": 2.81445312, + "step": 2958, + "time_per_iteration": 3.166334390640259 + }, + { + "auxiliary_loss_clip": 0.01539263, + "auxiliary_loss_mlp": 0.01304246, + "balance_loss_clip": 1.18657827, + "balance_loss_mlp": 1.02481997, + "epoch": 0.35579871340107017, + "flos": 26945495720640.0, + "grad_norm": 1.8099473881854433, + "language_loss": 0.78777593, + "learning_rate": 2.9862517295003617e-06, + "loss": 0.81621104, + "num_input_tokens_seen": 63568285, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 2.79882812, + "step": 2959, + "time_per_iteration": 3.9196081161499023 + }, + { + "auxiliary_loss_clip": 0.01533028, + "auxiliary_loss_mlp": 0.01303435, + "balance_loss_clip": 1.17952061, + "balance_loss_mlp": 1.02458072, + "epoch": 0.3559189562917093, + "flos": 28295865480960.0, + "grad_norm": 1.5585605635816961, + "language_loss": 0.72786939, + "learning_rate": 2.9855739821123654e-06, + "loss": 0.75623405, + "num_input_tokens_seen": 63589865, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 2.79296875, + "step": 2960, + "time_per_iteration": 3.075272798538208 + }, + { + "auxiliary_loss_clip": 0.01532496, + "auxiliary_loss_mlp": 0.01304067, + "balance_loss_clip": 1.17859983, + "balance_loss_mlp": 1.02635729, + "epoch": 0.35603919918234833, + "flos": 25666317845280.0, + "grad_norm": 2.036417666353884, + "language_loss": 0.82672918, + "learning_rate": 2.98489608521358e-06, + "loss": 0.85509479, + "num_input_tokens_seen": 63609805, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 2.78125, + "step": 2961, + "time_per_iteration": 2.972748279571533 + }, + { + "auxiliary_loss_clip": 0.01537557, + "auxiliary_loss_mlp": 0.01320005, + "balance_loss_clip": 1.18273711, + "balance_loss_mlp": 1.03943443, + "epoch": 0.35615944207298744, + "flos": 23002975854720.0, + "grad_norm": 2.3177140434689214, + "language_loss": 0.79946733, + "learning_rate": 2.9842180389068425e-06, + "loss": 0.82804298, + "num_input_tokens_seen": 63627115, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 2.81054688, + "step": 2962, + "time_per_iteration": 3.0322794914245605 + }, + { + "auxiliary_loss_clip": 0.01572541, + "auxiliary_loss_mlp": 0.01234818, + "balance_loss_clip": 1.21896839, + "balance_loss_mlp": 1.00593567, + "epoch": 0.35627968496362655, + "flos": 68258435411520.0, + "grad_norm": 0.8369831430073599, + "language_loss": 0.59179139, + "learning_rate": 2.98353984329501e-06, + "loss": 0.619865, + "num_input_tokens_seen": 63691460, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 2.2890625, + "step": 2963, + "time_per_iteration": 3.5269899368286133 + }, + { + "auxiliary_loss_clip": 0.01534624, + "auxiliary_loss_mlp": 0.01322618, + "balance_loss_clip": 1.18130004, + "balance_loss_mlp": 1.0382328, + "epoch": 0.3563999278542656, + "flos": 22644133889760.0, + "grad_norm": 2.002393236351514, + "language_loss": 0.70624328, + "learning_rate": 2.982861498480965e-06, + "loss": 0.73481572, + "num_input_tokens_seen": 63713840, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 2.84765625, + "step": 2964, + "time_per_iteration": 3.120847463607788 + }, + { + "auxiliary_loss_clip": 0.01533423, + "auxiliary_loss_mlp": 0.01317894, + "balance_loss_clip": 1.17989421, + "balance_loss_mlp": 1.03408051, + "epoch": 0.3565201707449047, + "flos": 25954612704000.0, + "grad_norm": 1.6688344179311687, + "language_loss": 0.82872862, + "learning_rate": 2.9821830045676122e-06, + "loss": 0.85724187, + "num_input_tokens_seen": 63733540, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 2.84375, + "step": 2965, + "time_per_iteration": 3.016897439956665 + }, + { + "auxiliary_loss_clip": 0.01536947, + "auxiliary_loss_mlp": 0.01328722, + "balance_loss_clip": 1.18248868, + "balance_loss_mlp": 1.04414582, + "epoch": 0.3566404136355438, + "flos": 28478263824000.0, + "grad_norm": 2.7915713093639996, + "language_loss": 0.72974086, + "learning_rate": 2.9815043616578793e-06, + "loss": 0.75839746, + "num_input_tokens_seen": 63754335, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.8515625, + "step": 2966, + "time_per_iteration": 2.986501932144165 + }, + { + "auxiliary_loss_clip": 0.01534004, + "auxiliary_loss_mlp": 0.01316116, + "balance_loss_clip": 1.17980719, + "balance_loss_mlp": 1.0315392, + "epoch": 0.3567606565261829, + "flos": 38366195938560.0, + "grad_norm": 3.3486917067391477, + "language_loss": 0.76886153, + "learning_rate": 2.9808255698547145e-06, + "loss": 0.7973628, + "num_input_tokens_seen": 63777135, + "router_z_loss_clip": 3.54101562, + "router_z_loss_mlp": 2.84960938, + "step": 2967, + "time_per_iteration": 3.223145008087158 + }, + { + "auxiliary_loss_clip": 0.01537985, + "auxiliary_loss_mlp": 0.01313732, + "balance_loss_clip": 1.18290949, + "balance_loss_mlp": 1.03602183, + "epoch": 0.356880899416822, + "flos": 21983191215840.0, + "grad_norm": 2.099609373111382, + "language_loss": 0.79622352, + "learning_rate": 2.9801466292610913e-06, + "loss": 0.82474071, + "num_input_tokens_seen": 63797020, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.78125, + "step": 2968, + "time_per_iteration": 2.988701820373535 + }, + { + "auxiliary_loss_clip": 0.01527395, + "auxiliary_loss_mlp": 0.01329064, + "balance_loss_clip": 1.17313302, + "balance_loss_mlp": 1.04849303, + "epoch": 0.35700114230746105, + "flos": 18991122583680.0, + "grad_norm": 4.486328157052282, + "language_loss": 0.81379735, + "learning_rate": 2.979467539980003e-06, + "loss": 0.84236193, + "num_input_tokens_seen": 63813810, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.81054688, + "step": 2969, + "time_per_iteration": 2.9195055961608887 + }, + { + "auxiliary_loss_clip": 0.01528726, + "auxiliary_loss_mlp": 0.01335984, + "balance_loss_clip": 1.17500496, + "balance_loss_mlp": 1.05083549, + "epoch": 0.35712138519810016, + "flos": 19758189342240.0, + "grad_norm": 2.015710950909613, + "language_loss": 0.7696265, + "learning_rate": 2.978788302114468e-06, + "loss": 0.79827356, + "num_input_tokens_seen": 63830925, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 2.85742188, + "step": 2970, + "time_per_iteration": 3.0442709922790527 + }, + { + "auxiliary_loss_clip": 0.01526427, + "auxiliary_loss_mlp": 0.01328633, + "balance_loss_clip": 1.17019618, + "balance_loss_mlp": 1.04462862, + "epoch": 0.35724162808873927, + "flos": 35185584601440.0, + "grad_norm": 2.3372990918096646, + "language_loss": 0.83194816, + "learning_rate": 2.9781089157675255e-06, + "loss": 0.86049873, + "num_input_tokens_seen": 63849385, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.84570312, + "step": 2971, + "time_per_iteration": 3.124119281768799 + }, + { + "auxiliary_loss_clip": 0.0152136, + "auxiliary_loss_mlp": 0.0133107, + "balance_loss_clip": 1.1651715, + "balance_loss_mlp": 1.04801941, + "epoch": 0.3573618709793783, + "flos": 25559093844000.0, + "grad_norm": 1.4833852275005963, + "language_loss": 0.88536894, + "learning_rate": 2.977429381042238e-06, + "loss": 0.91389334, + "num_input_tokens_seen": 63870060, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.8359375, + "step": 2972, + "time_per_iteration": 3.0246009826660156 + }, + { + "auxiliary_loss_clip": 0.01528125, + "auxiliary_loss_mlp": 0.01318829, + "balance_loss_clip": 1.17327476, + "balance_loss_mlp": 1.03978384, + "epoch": 0.35748211387001744, + "flos": 29134920615840.0, + "grad_norm": 2.122040710716682, + "language_loss": 0.89356995, + "learning_rate": 2.9767496980416913e-06, + "loss": 0.92203945, + "num_input_tokens_seen": 63889355, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 2.79492188, + "step": 2973, + "time_per_iteration": 3.045618772506714 + }, + { + "auxiliary_loss_clip": 0.01522686, + "auxiliary_loss_mlp": 0.01320873, + "balance_loss_clip": 1.16813445, + "balance_loss_mlp": 1.03477049, + "epoch": 0.35760235676065655, + "flos": 13955881498560.0, + "grad_norm": 2.3502523441963525, + "language_loss": 0.81251758, + "learning_rate": 2.9760698668689914e-06, + "loss": 0.84095311, + "num_input_tokens_seen": 63905580, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 2.86523438, + "step": 2974, + "time_per_iteration": 2.9775261878967285 + }, + { + "auxiliary_loss_clip": 0.01520932, + "auxiliary_loss_mlp": 0.01314526, + "balance_loss_clip": 1.16584492, + "balance_loss_mlp": 1.03242946, + "epoch": 0.3577225996512956, + "flos": 44022099627360.0, + "grad_norm": 2.4603208104713277, + "language_loss": 0.71135128, + "learning_rate": 2.975389887627269e-06, + "loss": 0.73970586, + "num_input_tokens_seen": 63928180, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.82617188, + "step": 2975, + "time_per_iteration": 3.2032229900360107 + }, + { + "auxiliary_loss_clip": 0.01518919, + "auxiliary_loss_mlp": 0.01330457, + "balance_loss_clip": 1.16313088, + "balance_loss_mlp": 1.04702497, + "epoch": 0.3578428425419347, + "flos": 17057070540000.0, + "grad_norm": 2.4612236895028343, + "language_loss": 0.90341032, + "learning_rate": 2.9747097604196764e-06, + "loss": 0.93190408, + "num_input_tokens_seen": 63944825, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.83984375, + "step": 2976, + "time_per_iteration": 2.9774985313415527 + }, + { + "auxiliary_loss_clip": 0.01559984, + "auxiliary_loss_mlp": 0.01396233, + "balance_loss_clip": 1.20318985, + "balance_loss_mlp": 1.15361786, + "epoch": 0.3579630854325738, + "flos": 71683648002720.0, + "grad_norm": 0.7288903942930939, + "language_loss": 0.56548184, + "learning_rate": 2.9740294853493875e-06, + "loss": 0.59504402, + "num_input_tokens_seen": 64016385, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.421875, + "step": 2977, + "time_per_iteration": 3.7313241958618164 + }, + { + "auxiliary_loss_clip": 0.01523321, + "auxiliary_loss_mlp": 0.01305871, + "balance_loss_clip": 1.16654229, + "balance_loss_mlp": 1.02739835, + "epoch": 0.3580833283232129, + "flos": 25049030847840.0, + "grad_norm": 2.168883052906186, + "language_loss": 0.67407739, + "learning_rate": 2.9733490625196008e-06, + "loss": 0.70236933, + "num_input_tokens_seen": 64036245, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.7890625, + "step": 2978, + "time_per_iteration": 3.0991034507751465 + }, + { + "auxiliary_loss_clip": 0.01520717, + "auxiliary_loss_mlp": 0.01304958, + "balance_loss_clip": 1.16468906, + "balance_loss_mlp": 1.02781987, + "epoch": 0.358203571213852, + "flos": 13954781581920.0, + "grad_norm": 4.777072243371119, + "language_loss": 0.75563645, + "learning_rate": 2.9726684920335353e-06, + "loss": 0.78389323, + "num_input_tokens_seen": 64054110, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.77539062, + "step": 2979, + "time_per_iteration": 4.012869358062744 + }, + { + "auxiliary_loss_clip": 0.0151789, + "auxiliary_loss_mlp": 0.01305177, + "balance_loss_clip": 1.16245902, + "balance_loss_mlp": 1.02498746, + "epoch": 0.35832381410449105, + "flos": 20304512023680.0, + "grad_norm": 2.4880100006999544, + "language_loss": 0.82109833, + "learning_rate": 2.971987773994432e-06, + "loss": 0.84932894, + "num_input_tokens_seen": 64070295, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.80664062, + "step": 2980, + "time_per_iteration": 3.8119406700134277 + }, + { + "auxiliary_loss_clip": 0.01518754, + "auxiliary_loss_mlp": 0.01305648, + "balance_loss_clip": 1.16173208, + "balance_loss_mlp": 1.02622116, + "epoch": 0.35844405699513016, + "flos": 16985158020000.0, + "grad_norm": 2.0421519242703776, + "language_loss": 0.83336103, + "learning_rate": 2.9713069085055566e-06, + "loss": 0.86160505, + "num_input_tokens_seen": 64088605, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.79882812, + "step": 2981, + "time_per_iteration": 3.0300052165985107 + }, + { + "auxiliary_loss_clip": 0.01517229, + "auxiliary_loss_mlp": 0.01301983, + "balance_loss_clip": 1.15956354, + "balance_loss_mlp": 1.02179337, + "epoch": 0.35856429988576927, + "flos": 23218827199200.0, + "grad_norm": 3.159311758979293, + "language_loss": 0.79209292, + "learning_rate": 2.9706258956701958e-06, + "loss": 0.82028508, + "num_input_tokens_seen": 64108595, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 2.80664062, + "step": 2982, + "time_per_iteration": 3.085029125213623 + }, + { + "auxiliary_loss_clip": 0.01516587, + "auxiliary_loss_mlp": 0.01302685, + "balance_loss_clip": 1.15926337, + "balance_loss_mlp": 1.02344894, + "epoch": 0.3586845427764083, + "flos": 23036542640640.0, + "grad_norm": 4.651524221467159, + "language_loss": 0.77823687, + "learning_rate": 2.9699447355916575e-06, + "loss": 0.80642956, + "num_input_tokens_seen": 64127405, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.796875, + "step": 2983, + "time_per_iteration": 3.828052282333374 + }, + { + "auxiliary_loss_clip": 0.01513092, + "auxiliary_loss_mlp": 0.01311135, + "balance_loss_clip": 1.15491831, + "balance_loss_mlp": 1.03685808, + "epoch": 0.35880478566704743, + "flos": 20012727774240.0, + "grad_norm": 2.2118164566381324, + "language_loss": 0.74052989, + "learning_rate": 2.969263428373275e-06, + "loss": 0.76877213, + "num_input_tokens_seen": 64145755, + "router_z_loss_clip": 3.58007812, + "router_z_loss_mlp": 2.74609375, + "step": 2984, + "time_per_iteration": 2.969855785369873 + }, + { + "auxiliary_loss_clip": 0.01514607, + "auxiliary_loss_mlp": 0.01314407, + "balance_loss_clip": 1.15673041, + "balance_loss_mlp": 1.0363152, + "epoch": 0.35892502855768654, + "flos": 13700812072320.0, + "grad_norm": 2.8646802117934267, + "language_loss": 0.79733181, + "learning_rate": 2.9685819741184007e-06, + "loss": 0.82562196, + "num_input_tokens_seen": 64164195, + "router_z_loss_clip": 3.58007812, + "router_z_loss_mlp": 2.78515625, + "step": 2985, + "time_per_iteration": 2.9829139709472656 + }, + { + "auxiliary_loss_clip": 0.01516742, + "auxiliary_loss_mlp": 0.0130494, + "balance_loss_clip": 1.15907001, + "balance_loss_mlp": 1.02799261, + "epoch": 0.3590452714483256, + "flos": 18116073260640.0, + "grad_norm": 3.2426037948221356, + "language_loss": 0.68666458, + "learning_rate": 2.967900372930411e-06, + "loss": 0.71488142, + "num_input_tokens_seen": 64182705, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 2.7734375, + "step": 2986, + "time_per_iteration": 3.010383367538452 + }, + { + "auxiliary_loss_clip": 0.01516246, + "auxiliary_loss_mlp": 0.01307637, + "balance_loss_clip": 1.15919256, + "balance_loss_mlp": 1.02954531, + "epoch": 0.3591655143389647, + "flos": 17751314502720.0, + "grad_norm": 2.4087340768030536, + "language_loss": 0.78645819, + "learning_rate": 2.9672186249127046e-06, + "loss": 0.81469703, + "num_input_tokens_seen": 64202170, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.78515625, + "step": 2987, + "time_per_iteration": 3.761957883834839 + }, + { + "auxiliary_loss_clip": 0.01521392, + "auxiliary_loss_mlp": 0.0131813, + "balance_loss_clip": 1.16543317, + "balance_loss_mlp": 1.0425185, + "epoch": 0.3592857572296038, + "flos": 25226536458240.0, + "grad_norm": 2.5436051323947324, + "language_loss": 0.78729129, + "learning_rate": 2.9665367301687014e-06, + "loss": 0.81568646, + "num_input_tokens_seen": 64220415, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.75976562, + "step": 2988, + "time_per_iteration": 2.9973137378692627 + }, + { + "auxiliary_loss_clip": 0.01519459, + "auxiliary_loss_mlp": 0.01299364, + "balance_loss_clip": 1.16305733, + "balance_loss_mlp": 1.02279818, + "epoch": 0.3594060001202429, + "flos": 29386804076640.0, + "grad_norm": 2.1536011065987033, + "language_loss": 0.76650327, + "learning_rate": 2.965854688801845e-06, + "loss": 0.7946915, + "num_input_tokens_seen": 64242475, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.76953125, + "step": 2989, + "time_per_iteration": 3.0662524700164795 + }, + { + "auxiliary_loss_clip": 0.01507976, + "auxiliary_loss_mlp": 0.01297467, + "balance_loss_clip": 1.15046477, + "balance_loss_mlp": 1.02242732, + "epoch": 0.359526243010882, + "flos": 17054643137760.0, + "grad_norm": 1.965331922891553, + "language_loss": 0.76565844, + "learning_rate": 2.9651725009156005e-06, + "loss": 0.79371285, + "num_input_tokens_seen": 64260220, + "router_z_loss_clip": 3.57617188, + "router_z_loss_mlp": 2.75390625, + "step": 2990, + "time_per_iteration": 3.0661795139312744 + }, + { + "auxiliary_loss_clip": 0.01511694, + "auxiliary_loss_mlp": 0.01308036, + "balance_loss_clip": 1.15496492, + "balance_loss_mlp": 1.03051686, + "epoch": 0.3596464859015211, + "flos": 22967247163680.0, + "grad_norm": 1.9650067541304963, + "language_loss": 0.74610794, + "learning_rate": 2.964490166613454e-06, + "loss": 0.77430522, + "num_input_tokens_seen": 64280145, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.77929688, + "step": 2991, + "time_per_iteration": 3.07218337059021 + }, + { + "auxiliary_loss_clip": 0.01573532, + "auxiliary_loss_mlp": 0.01210754, + "balance_loss_clip": 1.21616578, + "balance_loss_mlp": 0.98873901, + "epoch": 0.35976672879216015, + "flos": 54745179912000.0, + "grad_norm": 0.7765012972276121, + "language_loss": 0.57678151, + "learning_rate": 2.963807685998917e-06, + "loss": 0.60462439, + "num_input_tokens_seen": 64336010, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.2265625, + "step": 2992, + "time_per_iteration": 3.382364273071289 + }, + { + "auxiliary_loss_clip": 0.01509952, + "auxiliary_loss_mlp": 0.01307575, + "balance_loss_clip": 1.15204871, + "balance_loss_mlp": 1.02910197, + "epoch": 0.35988697168279926, + "flos": 43142157571680.0, + "grad_norm": 1.75585945136101, + "language_loss": 0.77900279, + "learning_rate": 2.9631250591755196e-06, + "loss": 0.80717802, + "num_input_tokens_seen": 64358725, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 2.7890625, + "step": 2993, + "time_per_iteration": 3.2665092945098877 + }, + { + "auxiliary_loss_clip": 0.01516857, + "auxiliary_loss_mlp": 0.01309777, + "balance_loss_clip": 1.15993369, + "balance_loss_mlp": 1.02863431, + "epoch": 0.36000721457343837, + "flos": 35848537467840.0, + "grad_norm": 1.820606486546283, + "language_loss": 0.5781709, + "learning_rate": 2.962442286246817e-06, + "loss": 0.60643727, + "num_input_tokens_seen": 64381555, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.81640625, + "step": 2994, + "time_per_iteration": 3.097468852996826 + }, + { + "auxiliary_loss_clip": 0.01514625, + "auxiliary_loss_mlp": 0.01315961, + "balance_loss_clip": 1.15783465, + "balance_loss_mlp": 1.0346272, + "epoch": 0.3601274574640774, + "flos": 18292782379680.0, + "grad_norm": 1.9533103741978797, + "language_loss": 0.69689941, + "learning_rate": 2.9617593673163853e-06, + "loss": 0.7252053, + "num_input_tokens_seen": 64400375, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.81835938, + "step": 2995, + "time_per_iteration": 3.0310075283050537 + }, + { + "auxiliary_loss_clip": 0.01512459, + "auxiliary_loss_mlp": 0.01305626, + "balance_loss_clip": 1.15423429, + "balance_loss_mlp": 1.02829695, + "epoch": 0.36024770035471654, + "flos": 13335636104640.0, + "grad_norm": 2.513477923194571, + "language_loss": 0.77429187, + "learning_rate": 2.9610763024878216e-06, + "loss": 0.80247271, + "num_input_tokens_seen": 64415880, + "router_z_loss_clip": 3.58398438, + "router_z_loss_mlp": 2.77734375, + "step": 2996, + "time_per_iteration": 3.066784620285034 + }, + { + "auxiliary_loss_clip": 0.01513578, + "auxiliary_loss_mlp": 0.01316711, + "balance_loss_clip": 1.15567827, + "balance_loss_mlp": 1.03423309, + "epoch": 0.3603679432453556, + "flos": 20269124686080.0, + "grad_norm": 38.17981177409183, + "language_loss": 0.9173184, + "learning_rate": 2.960393091864747e-06, + "loss": 0.94562125, + "num_input_tokens_seen": 64434260, + "router_z_loss_clip": 3.58007812, + "router_z_loss_mlp": 2.83007812, + "step": 2997, + "time_per_iteration": 3.004406213760376 + }, + { + "auxiliary_loss_clip": 0.01518237, + "auxiliary_loss_mlp": 0.01325193, + "balance_loss_clip": 1.16109276, + "balance_loss_mlp": 1.04424095, + "epoch": 0.3604881861359947, + "flos": 22453087926240.0, + "grad_norm": 2.057502476759725, + "language_loss": 0.74797595, + "learning_rate": 2.959709735550804e-06, + "loss": 0.77641028, + "num_input_tokens_seen": 64453855, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.81445312, + "step": 2998, + "time_per_iteration": 3.096031665802002 + }, + { + "auxiliary_loss_clip": 0.01514884, + "auxiliary_loss_mlp": 0.01318903, + "balance_loss_clip": 1.15798235, + "balance_loss_mlp": 1.03756905, + "epoch": 0.3606084290266338, + "flos": 22056431221440.0, + "grad_norm": 2.22672085429807, + "language_loss": 0.75984985, + "learning_rate": 2.9590262336496575e-06, + "loss": 0.78818774, + "num_input_tokens_seen": 64473585, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.81640625, + "step": 2999, + "time_per_iteration": 2.9236690998077393 + }, + { + "auxiliary_loss_clip": 0.01521848, + "auxiliary_loss_mlp": 0.01318152, + "balance_loss_clip": 1.16359115, + "balance_loss_mlp": 1.03605533, + "epoch": 0.36072867191727287, + "flos": 15634750331520.0, + "grad_norm": 2.473480159526475, + "language_loss": 0.85115081, + "learning_rate": 2.9583425862649936e-06, + "loss": 0.87955087, + "num_input_tokens_seen": 64491720, + "router_z_loss_clip": 3.58398438, + "router_z_loss_mlp": 2.82617188, + "step": 3000, + "time_per_iteration": 3.036142110824585 + }, + { + "auxiliary_loss_clip": 0.01518468, + "auxiliary_loss_mlp": 0.01323999, + "balance_loss_clip": 1.1621027, + "balance_loss_mlp": 1.04056716, + "epoch": 0.360848914807912, + "flos": 19678729118400.0, + "grad_norm": 2.719791449400585, + "language_loss": 0.73798096, + "learning_rate": 2.9576587935005215e-06, + "loss": 0.7664057, + "num_input_tokens_seen": 64509800, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.83984375, + "step": 3001, + "time_per_iteration": 2.9998364448547363 + }, + { + "auxiliary_loss_clip": 0.01512058, + "auxiliary_loss_mlp": 0.01322035, + "balance_loss_clip": 1.15476894, + "balance_loss_mlp": 1.03936636, + "epoch": 0.3609691576985511, + "flos": 18879688556640.0, + "grad_norm": 4.209137104184472, + "language_loss": 0.71816933, + "learning_rate": 2.9569748554599713e-06, + "loss": 0.74651027, + "num_input_tokens_seen": 64525410, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.83203125, + "step": 3002, + "time_per_iteration": 3.005171060562134 + }, + { + "auxiliary_loss_clip": 0.01513804, + "auxiliary_loss_mlp": 0.01326513, + "balance_loss_clip": 1.15635252, + "balance_loss_mlp": 1.04575181, + "epoch": 0.36108940058919015, + "flos": 42227017819200.0, + "grad_norm": 3.1785074758719936, + "language_loss": 0.73210609, + "learning_rate": 2.956290772247097e-06, + "loss": 0.76050925, + "num_input_tokens_seen": 64544085, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 2.8125, + "step": 3003, + "time_per_iteration": 3.1350879669189453 + }, + { + "auxiliary_loss_clip": 0.01520852, + "auxiliary_loss_mlp": 0.01319195, + "balance_loss_clip": 1.16506791, + "balance_loss_mlp": 1.03938699, + "epoch": 0.36120964347982926, + "flos": 23187384390240.0, + "grad_norm": 2.541073744479392, + "language_loss": 0.73543465, + "learning_rate": 2.9556065439656724e-06, + "loss": 0.76383513, + "num_input_tokens_seen": 64563135, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.80273438, + "step": 3004, + "time_per_iteration": 2.9840445518493652 + }, + { + "auxiliary_loss_clip": 0.01522511, + "auxiliary_loss_mlp": 0.01306269, + "balance_loss_clip": 1.16662085, + "balance_loss_mlp": 1.02588844, + "epoch": 0.36132988637046837, + "flos": 18114973344000.0, + "grad_norm": 1.8417519107876228, + "language_loss": 0.81961554, + "learning_rate": 2.9549221707194952e-06, + "loss": 0.84790331, + "num_input_tokens_seen": 64581985, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.80859375, + "step": 3005, + "time_per_iteration": 3.091264486312866 + }, + { + "auxiliary_loss_clip": 0.01514464, + "auxiliary_loss_mlp": 0.01316092, + "balance_loss_clip": 1.15742445, + "balance_loss_mlp": 1.03723717, + "epoch": 0.3614501292611074, + "flos": 27815273029440.0, + "grad_norm": 3.930948981629683, + "language_loss": 0.73189855, + "learning_rate": 2.954237652612384e-06, + "loss": 0.76020408, + "num_input_tokens_seen": 64601035, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.79296875, + "step": 3006, + "time_per_iteration": 4.870320081710815 + }, + { + "auxiliary_loss_clip": 0.01511723, + "auxiliary_loss_mlp": 0.0130626, + "balance_loss_clip": 1.15485811, + "balance_loss_mlp": 1.02855039, + "epoch": 0.36157037215174653, + "flos": 22636965467520.0, + "grad_norm": 2.4846755124243654, + "language_loss": 0.84629285, + "learning_rate": 2.9535529897481796e-06, + "loss": 0.87447268, + "num_input_tokens_seen": 64618580, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.78125, + "step": 3007, + "time_per_iteration": 2.996170997619629 + }, + { + "auxiliary_loss_clip": 0.01510943, + "auxiliary_loss_mlp": 0.01309893, + "balance_loss_clip": 1.15408003, + "balance_loss_mlp": 1.02951229, + "epoch": 0.36169061504238564, + "flos": 12602287844640.0, + "grad_norm": 3.238171426158343, + "language_loss": 0.7669099, + "learning_rate": 2.9528681822307446e-06, + "loss": 0.79511821, + "num_input_tokens_seen": 64635430, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 2.80859375, + "step": 3008, + "time_per_iteration": 3.0211081504821777 + }, + { + "auxiliary_loss_clip": 0.01516195, + "auxiliary_loss_mlp": 0.01303593, + "balance_loss_clip": 1.15992367, + "balance_loss_mlp": 1.0258832, + "epoch": 0.3618108579330247, + "flos": 26686898975520.0, + "grad_norm": 2.1939364800008105, + "language_loss": 0.82432616, + "learning_rate": 2.952183230163964e-06, + "loss": 0.85252404, + "num_input_tokens_seen": 64655005, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.78125, + "step": 3009, + "time_per_iteration": 3.072282314300537 + }, + { + "auxiliary_loss_clip": 0.01510879, + "auxiliary_loss_mlp": 0.01315369, + "balance_loss_clip": 1.15404105, + "balance_loss_mlp": 1.03727758, + "epoch": 0.3619311008236638, + "flos": 22819060385280.0, + "grad_norm": 2.5037550834992492, + "language_loss": 0.7274884, + "learning_rate": 2.9514981336517448e-06, + "loss": 0.75575089, + "num_input_tokens_seen": 64674775, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 2.78515625, + "step": 3010, + "time_per_iteration": 3.8754212856292725 + }, + { + "auxiliary_loss_clip": 0.01515958, + "auxiliary_loss_mlp": 0.01311372, + "balance_loss_clip": 1.1588614, + "balance_loss_mlp": 1.03442526, + "epoch": 0.36205134371430286, + "flos": 25921880337600.0, + "grad_norm": 2.2856963898992784, + "language_loss": 0.81486535, + "learning_rate": 2.950812892798015e-06, + "loss": 0.84313864, + "num_input_tokens_seen": 64695670, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 2.7734375, + "step": 3011, + "time_per_iteration": 3.082791566848755 + }, + { + "auxiliary_loss_clip": 0.01521684, + "auxiliary_loss_mlp": 0.01301261, + "balance_loss_clip": 1.16499436, + "balance_loss_mlp": 1.02736592, + "epoch": 0.362171586604942, + "flos": 26141941707840.0, + "grad_norm": 2.1850728686547662, + "language_loss": 0.87746716, + "learning_rate": 2.9501275077067256e-06, + "loss": 0.90569663, + "num_input_tokens_seen": 64716290, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.7421875, + "step": 3012, + "time_per_iteration": 3.0671403408050537 + }, + { + "auxiliary_loss_clip": 0.01520295, + "auxiliary_loss_mlp": 0.01302069, + "balance_loss_clip": 1.16420579, + "balance_loss_mlp": 1.0262661, + "epoch": 0.3622918294955811, + "flos": 28076714386560.0, + "grad_norm": 2.6252961288701697, + "language_loss": 0.88294536, + "learning_rate": 2.949441978481848e-06, + "loss": 0.91116899, + "num_input_tokens_seen": 64737190, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.76171875, + "step": 3013, + "time_per_iteration": 3.0029258728027344 + }, + { + "auxiliary_loss_clip": 0.01518226, + "auxiliary_loss_mlp": 0.01310382, + "balance_loss_clip": 1.16073084, + "balance_loss_mlp": 1.02828562, + "epoch": 0.36241207238622014, + "flos": 19830026005920.0, + "grad_norm": 2.1175068135974473, + "language_loss": 0.80197167, + "learning_rate": 2.9487563052273778e-06, + "loss": 0.83025765, + "num_input_tokens_seen": 64753950, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 2.82421875, + "step": 3014, + "time_per_iteration": 2.9836716651916504 + }, + { + "auxiliary_loss_clip": 0.01519427, + "auxiliary_loss_mlp": 0.01302523, + "balance_loss_clip": 1.16241562, + "balance_loss_mlp": 1.02462196, + "epoch": 0.36253231527685925, + "flos": 21399281363520.0, + "grad_norm": 2.529257718731769, + "language_loss": 0.85895586, + "learning_rate": 2.94807048804733e-06, + "loss": 0.88717532, + "num_input_tokens_seen": 64773570, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.78320312, + "step": 3015, + "time_per_iteration": 3.979022979736328 + }, + { + "auxiliary_loss_clip": 0.01516933, + "auxiliary_loss_mlp": 0.01305072, + "balance_loss_clip": 1.15908778, + "balance_loss_mlp": 1.02469218, + "epoch": 0.36265255816749836, + "flos": 18364505258880.0, + "grad_norm": 2.059060029883424, + "language_loss": 0.90439451, + "learning_rate": 2.9473845270457434e-06, + "loss": 0.93261456, + "num_input_tokens_seen": 64790385, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 2.80859375, + "step": 3016, + "time_per_iteration": 3.006209135055542 + }, + { + "auxiliary_loss_clip": 0.01516722, + "auxiliary_loss_mlp": 0.01295674, + "balance_loss_clip": 1.15913701, + "balance_loss_mlp": 1.01815534, + "epoch": 0.3627728010581374, + "flos": 18661978732320.0, + "grad_norm": 4.17323410211121, + "language_loss": 0.70086026, + "learning_rate": 2.946698422326677e-06, + "loss": 0.7289843, + "num_input_tokens_seen": 64807845, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 2.77929688, + "step": 3017, + "time_per_iteration": 3.0252389907836914 + }, + { + "auxiliary_loss_clip": 0.01520369, + "auxiliary_loss_mlp": 0.01304549, + "balance_loss_clip": 1.16311872, + "balance_loss_mlp": 1.02416849, + "epoch": 0.36289304394877653, + "flos": 27526598889120.0, + "grad_norm": 2.4410694900556282, + "language_loss": 0.79511923, + "learning_rate": 2.946012173994213e-06, + "loss": 0.82336843, + "num_input_tokens_seen": 64827630, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.80664062, + "step": 3018, + "time_per_iteration": 3.0637049674987793 + }, + { + "auxiliary_loss_clip": 0.01521692, + "auxiliary_loss_mlp": 0.01304449, + "balance_loss_clip": 1.16232836, + "balance_loss_mlp": 1.02921832, + "epoch": 0.36301328683941564, + "flos": 34536172088160.0, + "grad_norm": 1.3881014319198428, + "language_loss": 0.67907804, + "learning_rate": 2.945325782152454e-06, + "loss": 0.70733941, + "num_input_tokens_seen": 64850665, + "router_z_loss_clip": 3.59375, + "router_z_loss_mlp": 2.75585938, + "step": 3019, + "time_per_iteration": 3.0564005374908447 + }, + { + "auxiliary_loss_clip": 0.01517543, + "auxiliary_loss_mlp": 0.01309787, + "balance_loss_clip": 1.15899014, + "balance_loss_mlp": 1.03188634, + "epoch": 0.3631335297300547, + "flos": 19027989119520.0, + "grad_norm": 2.4701084456671722, + "language_loss": 0.79017091, + "learning_rate": 2.9446392469055257e-06, + "loss": 0.81844413, + "num_input_tokens_seen": 64868700, + "router_z_loss_clip": 3.58789062, + "router_z_loss_mlp": 2.78320312, + "step": 3020, + "time_per_iteration": 3.0725157260894775 + }, + { + "auxiliary_loss_clip": 0.01524545, + "auxiliary_loss_mlp": 0.01300138, + "balance_loss_clip": 1.1665411, + "balance_loss_mlp": 1.0250988, + "epoch": 0.3632537726206938, + "flos": 19538810678880.0, + "grad_norm": 2.4391579033114787, + "language_loss": 0.8002916, + "learning_rate": 2.9439525683575745e-06, + "loss": 0.82853842, + "num_input_tokens_seen": 64887620, + "router_z_loss_clip": 3.58203125, + "router_z_loss_mlp": 2.75390625, + "step": 3021, + "time_per_iteration": 2.9542927742004395 + }, + { + "auxiliary_loss_clip": 0.01517361, + "auxiliary_loss_mlp": 0.01299187, + "balance_loss_clip": 1.1578685, + "balance_loss_mlp": 1.02204978, + "epoch": 0.3633740155113329, + "flos": 21070706434560.0, + "grad_norm": 2.6566467103135403, + "language_loss": 0.74921727, + "learning_rate": 2.9432657466127694e-06, + "loss": 0.77738273, + "num_input_tokens_seen": 64907190, + "router_z_loss_clip": 3.59570312, + "router_z_loss_mlp": 2.77539062, + "step": 3022, + "time_per_iteration": 2.9837958812713623 + }, + { + "auxiliary_loss_clip": 0.01526518, + "auxiliary_loss_mlp": 0.01302665, + "balance_loss_clip": 1.16615748, + "balance_loss_mlp": 1.02609897, + "epoch": 0.36349425840197197, + "flos": 20300567495040.0, + "grad_norm": 1.7234781215722348, + "language_loss": 0.76668727, + "learning_rate": 2.9425787817753007e-06, + "loss": 0.7949791, + "num_input_tokens_seen": 64925850, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 2.76953125, + "step": 3023, + "time_per_iteration": 3.0245649814605713 + }, + { + "auxiliary_loss_clip": 0.01511532, + "auxiliary_loss_mlp": 0.01296761, + "balance_loss_clip": 1.15310979, + "balance_loss_mlp": 1.01771617, + "epoch": 0.3636145012926111, + "flos": 29719399390560.0, + "grad_norm": 1.5257753638893943, + "language_loss": 0.71221316, + "learning_rate": 2.94189167394938e-06, + "loss": 0.74029613, + "num_input_tokens_seen": 64948285, + "router_z_loss_clip": 3.58007812, + "router_z_loss_mlp": 2.79492188, + "step": 3024, + "time_per_iteration": 3.071951389312744 + }, + { + "auxiliary_loss_clip": 0.01519652, + "auxiliary_loss_mlp": 0.01294731, + "balance_loss_clip": 1.16105807, + "balance_loss_mlp": 1.0179745, + "epoch": 0.3637347441832502, + "flos": 21433872209760.0, + "grad_norm": 1.8450999781873627, + "language_loss": 0.81253934, + "learning_rate": 2.941204423239241e-06, + "loss": 0.84068322, + "num_input_tokens_seen": 64967160, + "router_z_loss_clip": 3.58789062, + "router_z_loss_mlp": 2.77148438, + "step": 3025, + "time_per_iteration": 3.0848474502563477 + }, + { + "auxiliary_loss_clip": 0.0151408, + "auxiliary_loss_mlp": 0.01305736, + "balance_loss_clip": 1.15390241, + "balance_loss_mlp": 1.02897954, + "epoch": 0.36385498707388925, + "flos": 29536545909600.0, + "grad_norm": 2.156863221299007, + "language_loss": 0.75947261, + "learning_rate": 2.9405170297491395e-06, + "loss": 0.78767073, + "num_input_tokens_seen": 64987155, + "router_z_loss_clip": 3.6015625, + "router_z_loss_mlp": 2.77148438, + "step": 3026, + "time_per_iteration": 3.0768649578094482 + }, + { + "auxiliary_loss_clip": 0.01520227, + "auxiliary_loss_mlp": 0.01299633, + "balance_loss_clip": 1.16118765, + "balance_loss_mlp": 1.02058756, + "epoch": 0.36397522996452836, + "flos": 22238753708160.0, + "grad_norm": 2.31735662643192, + "language_loss": 0.80453736, + "learning_rate": 2.939829493583353e-06, + "loss": 0.83273596, + "num_input_tokens_seen": 65003800, + "router_z_loss_clip": 3.59179688, + "router_z_loss_mlp": 2.79492188, + "step": 3027, + "time_per_iteration": 3.089447498321533 + }, + { + "auxiliary_loss_clip": 0.01518863, + "auxiliary_loss_mlp": 0.01289823, + "balance_loss_clip": 1.15851319, + "balance_loss_mlp": 1.01363909, + "epoch": 0.3640954728551674, + "flos": 21508629341760.0, + "grad_norm": 2.8576505535207275, + "language_loss": 0.82973564, + "learning_rate": 2.939141814846179e-06, + "loss": 0.85782254, + "num_input_tokens_seen": 65021215, + "router_z_loss_clip": 3.6015625, + "router_z_loss_mlp": 2.765625, + "step": 3028, + "time_per_iteration": 3.1381983757019043 + }, + { + "auxiliary_loss_clip": 0.01515737, + "auxiliary_loss_mlp": 0.01317788, + "balance_loss_clip": 1.15688372, + "balance_loss_mlp": 1.03912473, + "epoch": 0.3642157157458065, + "flos": 17714827248480.0, + "grad_norm": 1.834170372569274, + "language_loss": 0.82674384, + "learning_rate": 2.938453993641938e-06, + "loss": 0.85507905, + "num_input_tokens_seen": 65039590, + "router_z_loss_clip": 3.58789062, + "router_z_loss_mlp": 2.79101562, + "step": 3029, + "time_per_iteration": 3.004546880722046 + }, + { + "auxiliary_loss_clip": 0.01527944, + "auxiliary_loss_mlp": 0.01300436, + "balance_loss_clip": 1.16720653, + "balance_loss_mlp": 1.02406168, + "epoch": 0.36433595863644563, + "flos": 17641473458400.0, + "grad_norm": 2.4181030626751965, + "language_loss": 0.70451605, + "learning_rate": 2.937766030074973e-06, + "loss": 0.73279983, + "num_input_tokens_seen": 65056845, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 2.76757812, + "step": 3030, + "time_per_iteration": 3.0912537574768066 + }, + { + "auxiliary_loss_clip": 0.01516211, + "auxiliary_loss_mlp": 0.01309813, + "balance_loss_clip": 1.15561438, + "balance_loss_mlp": 1.03267527, + "epoch": 0.3644562015270847, + "flos": 26835692604480.0, + "grad_norm": 2.0514721062084416, + "language_loss": 0.82657468, + "learning_rate": 2.937077924249646e-06, + "loss": 0.85483491, + "num_input_tokens_seen": 65079435, + "router_z_loss_clip": 3.609375, + "router_z_loss_mlp": 2.77539062, + "step": 3031, + "time_per_iteration": 3.2623329162597656 + }, + { + "auxiliary_loss_clip": 0.01514237, + "auxiliary_loss_mlp": 0.01300053, + "balance_loss_clip": 1.15514851, + "balance_loss_mlp": 1.02272463, + "epoch": 0.3645764444177238, + "flos": 14284456427520.0, + "grad_norm": 2.868205431836209, + "language_loss": 0.75355011, + "learning_rate": 2.9363896762703443e-06, + "loss": 0.78169304, + "num_input_tokens_seen": 65096500, + "router_z_loss_clip": 3.58789062, + "router_z_loss_mlp": 2.77734375, + "step": 3032, + "time_per_iteration": 3.012239933013916 + }, + { + "auxiliary_loss_clip": 0.01519649, + "auxiliary_loss_mlp": 0.01323031, + "balance_loss_clip": 1.15968466, + "balance_loss_mlp": 1.04226947, + "epoch": 0.3646966873083629, + "flos": 20669574206880.0, + "grad_norm": 1.8352974497884647, + "language_loss": 0.84372354, + "learning_rate": 2.9357012862414725e-06, + "loss": 0.8721503, + "num_input_tokens_seen": 65115860, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 2.8125, + "step": 3033, + "time_per_iteration": 3.0247228145599365 + }, + { + "auxiliary_loss_clip": 0.0151912, + "auxiliary_loss_mlp": 0.01305444, + "balance_loss_clip": 1.15981281, + "balance_loss_mlp": 1.02639842, + "epoch": 0.36481693019900197, + "flos": 27785119777920.0, + "grad_norm": 2.3143210306452664, + "language_loss": 0.71735686, + "learning_rate": 2.9350127542674593e-06, + "loss": 0.74560249, + "num_input_tokens_seen": 65138070, + "router_z_loss_clip": 3.59179688, + "router_z_loss_mlp": 2.79492188, + "step": 3034, + "time_per_iteration": 3.975961685180664 + }, + { + "auxiliary_loss_clip": 0.01521439, + "auxiliary_loss_mlp": 0.01301272, + "balance_loss_clip": 1.16183472, + "balance_loss_mlp": 1.02184522, + "epoch": 0.3649371730896411, + "flos": 19714268168640.0, + "grad_norm": 1.9472934863701419, + "language_loss": 0.76553762, + "learning_rate": 2.934324080452755e-06, + "loss": 0.79376471, + "num_input_tokens_seen": 65155860, + "router_z_loss_clip": 3.59570312, + "router_z_loss_mlp": 2.79882812, + "step": 3035, + "time_per_iteration": 3.0676591396331787 + }, + { + "auxiliary_loss_clip": 0.01519689, + "auxiliary_loss_mlp": 0.01311253, + "balance_loss_clip": 1.15821338, + "balance_loss_mlp": 1.03125453, + "epoch": 0.3650574159802802, + "flos": 24752619362880.0, + "grad_norm": 1.8883167250948534, + "language_loss": 0.77814806, + "learning_rate": 2.9336352649018307e-06, + "loss": 0.80645746, + "num_input_tokens_seen": 65175930, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 2.8046875, + "step": 3036, + "time_per_iteration": 3.201329231262207 + }, + { + "auxiliary_loss_clip": 0.01524004, + "auxiliary_loss_mlp": 0.01318423, + "balance_loss_clip": 1.16345, + "balance_loss_mlp": 1.03880572, + "epoch": 0.36517765887091924, + "flos": 32856241266720.0, + "grad_norm": 1.952938792596969, + "language_loss": 0.70128191, + "learning_rate": 2.9329463077191783e-06, + "loss": 0.72970617, + "num_input_tokens_seen": 65199305, + "router_z_loss_clip": 3.60742188, + "router_z_loss_mlp": 2.80078125, + "step": 3037, + "time_per_iteration": 3.0579330921173096 + }, + { + "auxiliary_loss_clip": 0.01516479, + "auxiliary_loss_mlp": 0.0131019, + "balance_loss_clip": 1.15587175, + "balance_loss_mlp": 1.0347693, + "epoch": 0.36529790176155835, + "flos": 20122379177760.0, + "grad_norm": 3.2364035019434785, + "language_loss": 0.64327985, + "learning_rate": 2.9322572090093135e-06, + "loss": 0.67154658, + "num_input_tokens_seen": 65218010, + "router_z_loss_clip": 3.60742188, + "router_z_loss_mlp": 2.7578125, + "step": 3038, + "time_per_iteration": 3.8960437774658203 + }, + { + "auxiliary_loss_clip": 0.015151, + "auxiliary_loss_mlp": 0.01310684, + "balance_loss_clip": 1.1545614, + "balance_loss_mlp": 1.03182983, + "epoch": 0.36541814465219746, + "flos": 17641511386560.0, + "grad_norm": 3.828075301388965, + "language_loss": 0.76123333, + "learning_rate": 2.9315679688767713e-06, + "loss": 0.78949112, + "num_input_tokens_seen": 65236020, + "router_z_loss_clip": 3.60742188, + "router_z_loss_mlp": 2.79296875, + "step": 3039, + "time_per_iteration": 3.155406951904297 + }, + { + "auxiliary_loss_clip": 0.01510911, + "auxiliary_loss_mlp": 0.01290874, + "balance_loss_clip": 1.14938581, + "balance_loss_mlp": 1.01735997, + "epoch": 0.3655383875428365, + "flos": 22676562830880.0, + "grad_norm": 2.2606851598234936, + "language_loss": 0.66591609, + "learning_rate": 2.9308785874261085e-06, + "loss": 0.69393396, + "num_input_tokens_seen": 65256210, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 2.73828125, + "step": 3040, + "time_per_iteration": 3.1143338680267334 + }, + { + "auxiliary_loss_clip": 0.01514507, + "auxiliary_loss_mlp": 0.01307416, + "balance_loss_clip": 1.15360653, + "balance_loss_mlp": 1.03294849, + "epoch": 0.36565863043347563, + "flos": 21983342928480.0, + "grad_norm": 2.701285045260819, + "language_loss": 0.81398195, + "learning_rate": 2.9301890647619045e-06, + "loss": 0.84220117, + "num_input_tokens_seen": 65275505, + "router_z_loss_clip": 3.609375, + "router_z_loss_mlp": 2.74804688, + "step": 3041, + "time_per_iteration": 3.0640087127685547 + }, + { + "auxiliary_loss_clip": 0.01511651, + "auxiliary_loss_mlp": 0.01308409, + "balance_loss_clip": 1.15186024, + "balance_loss_mlp": 1.02936435, + "epoch": 0.36577887332411474, + "flos": 24829993537920.0, + "grad_norm": 2.650785896937603, + "language_loss": 0.80834103, + "learning_rate": 2.929499400988759e-06, + "loss": 0.83654165, + "num_input_tokens_seen": 65296665, + "router_z_loss_clip": 3.6015625, + "router_z_loss_mlp": 2.79492188, + "step": 3042, + "time_per_iteration": 3.816987991333008 + }, + { + "auxiliary_loss_clip": 0.01508627, + "auxiliary_loss_mlp": 0.01306155, + "balance_loss_clip": 1.1481545, + "balance_loss_mlp": 1.02749181, + "epoch": 0.3658991162147538, + "flos": 28295713768320.0, + "grad_norm": 2.5292662942003337, + "language_loss": 0.65446901, + "learning_rate": 2.9288095962112927e-06, + "loss": 0.68261683, + "num_input_tokens_seen": 65317370, + "router_z_loss_clip": 3.6015625, + "router_z_loss_mlp": 2.79101562, + "step": 3043, + "time_per_iteration": 3.0138800144195557 + }, + { + "auxiliary_loss_clip": 0.01516543, + "auxiliary_loss_mlp": 0.01305091, + "balance_loss_clip": 1.1562959, + "balance_loss_mlp": 1.0273807, + "epoch": 0.3660193591053929, + "flos": 17787498331680.0, + "grad_norm": 2.134452032069567, + "language_loss": 0.85411215, + "learning_rate": 2.9281196505341503e-06, + "loss": 0.88232845, + "num_input_tokens_seen": 65334540, + "router_z_loss_clip": 3.60351562, + "router_z_loss_mlp": 2.78125, + "step": 3044, + "time_per_iteration": 2.987819194793701 + }, + { + "auxiliary_loss_clip": 0.01510618, + "auxiliary_loss_mlp": 0.01306104, + "balance_loss_clip": 1.15037656, + "balance_loss_mlp": 1.03087306, + "epoch": 0.36613960199603196, + "flos": 10344326035680.0, + "grad_norm": 1.9820225441080912, + "language_loss": 0.7862258, + "learning_rate": 2.9274295640619946e-06, + "loss": 0.81439304, + "num_input_tokens_seen": 65351670, + "router_z_loss_clip": 3.6015625, + "router_z_loss_mlp": 2.75585938, + "step": 3045, + "time_per_iteration": 2.951756000518799 + }, + { + "auxiliary_loss_clip": 0.01511385, + "auxiliary_loss_mlp": 0.01298608, + "balance_loss_clip": 1.15087366, + "balance_loss_mlp": 1.02585673, + "epoch": 0.36625984488667107, + "flos": 19757658348000.0, + "grad_norm": 1.8726432827142716, + "language_loss": 0.79079473, + "learning_rate": 2.9267393368995103e-06, + "loss": 0.81889462, + "num_input_tokens_seen": 65370900, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 2.73046875, + "step": 3046, + "time_per_iteration": 2.9440040588378906 + }, + { + "auxiliary_loss_clip": 0.01508776, + "auxiliary_loss_mlp": 0.01305197, + "balance_loss_clip": 1.14844227, + "balance_loss_mlp": 1.02805948, + "epoch": 0.3663800877773102, + "flos": 17676102232800.0, + "grad_norm": 2.8682924053874324, + "language_loss": 0.74534935, + "learning_rate": 2.926048969151407e-06, + "loss": 0.77348912, + "num_input_tokens_seen": 65388185, + "router_z_loss_clip": 3.60742188, + "router_z_loss_mlp": 2.77539062, + "step": 3047, + "time_per_iteration": 3.1210033893585205 + }, + { + "auxiliary_loss_clip": 0.01510634, + "auxiliary_loss_mlp": 0.01301136, + "balance_loss_clip": 1.14991164, + "balance_loss_mlp": 1.02285349, + "epoch": 0.36650033066794924, + "flos": 20305725724800.0, + "grad_norm": 1.9441324766986077, + "language_loss": 0.68723738, + "learning_rate": 2.92535846092241e-06, + "loss": 0.7153551, + "num_input_tokens_seen": 65407200, + "router_z_loss_clip": 3.60742188, + "router_z_loss_mlp": 2.78710938, + "step": 3048, + "time_per_iteration": 3.100688934326172 + }, + { + "auxiliary_loss_clip": 0.01510227, + "auxiliary_loss_mlp": 0.0130522, + "balance_loss_clip": 1.15019178, + "balance_loss_mlp": 1.02903605, + "epoch": 0.36662057355858835, + "flos": 24718483654560.0, + "grad_norm": 1.7601518771048938, + "language_loss": 0.82503402, + "learning_rate": 2.9246678123172704e-06, + "loss": 0.85318857, + "num_input_tokens_seen": 65427290, + "router_z_loss_clip": 3.6015625, + "router_z_loss_mlp": 2.765625, + "step": 3049, + "time_per_iteration": 3.0582191944122314 + }, + { + "auxiliary_loss_clip": 0.01510082, + "auxiliary_loss_mlp": 0.01298926, + "balance_loss_clip": 1.14922941, + "balance_loss_mlp": 1.0238862, + "epoch": 0.36674081644922746, + "flos": 12386891638080.0, + "grad_norm": 2.888084134849974, + "language_loss": 0.75017387, + "learning_rate": 2.9239770234407596e-06, + "loss": 0.77826393, + "num_input_tokens_seen": 65445595, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 2.75390625, + "step": 3050, + "time_per_iteration": 3.068746328353882 + }, + { + "auxiliary_loss_clip": 0.01507558, + "auxiliary_loss_mlp": 0.01307099, + "balance_loss_clip": 1.14639425, + "balance_loss_mlp": 1.02900743, + "epoch": 0.3668610593398665, + "flos": 21108066036480.0, + "grad_norm": 1.7822165841980948, + "language_loss": 0.68996537, + "learning_rate": 2.9232860943976686e-06, + "loss": 0.71811193, + "num_input_tokens_seen": 65466330, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 2.78515625, + "step": 3051, + "time_per_iteration": 3.04365873336792 + }, + { + "auxiliary_loss_clip": 0.01511095, + "auxiliary_loss_mlp": 0.01307552, + "balance_loss_clip": 1.14967775, + "balance_loss_mlp": 1.03308415, + "epoch": 0.3669813022305056, + "flos": 26760366550080.0, + "grad_norm": 2.3218336446601455, + "language_loss": 0.84458709, + "learning_rate": 2.9225950252928115e-06, + "loss": 0.87277359, + "num_input_tokens_seen": 65487180, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.74804688, + "step": 3052, + "time_per_iteration": 3.040076494216919 + }, + { + "auxiliary_loss_clip": 0.01512069, + "auxiliary_loss_mlp": 0.01301531, + "balance_loss_clip": 1.15108538, + "balance_loss_mlp": 1.0228672, + "epoch": 0.36710154512114473, + "flos": 19101570478560.0, + "grad_norm": 2.5946405422921703, + "language_loss": 0.82044816, + "learning_rate": 2.9219038162310217e-06, + "loss": 0.84858418, + "num_input_tokens_seen": 65505380, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 2.79101562, + "step": 3053, + "time_per_iteration": 2.994319200515747 + }, + { + "auxiliary_loss_clip": 0.0151218, + "auxiliary_loss_mlp": 0.0129602, + "balance_loss_clip": 1.1518904, + "balance_loss_mlp": 1.01964533, + "epoch": 0.3672217880117838, + "flos": 20814044025600.0, + "grad_norm": 2.086381443692437, + "language_loss": 0.82713294, + "learning_rate": 2.921212467317157e-06, + "loss": 0.85521495, + "num_input_tokens_seen": 65524825, + "router_z_loss_clip": 3.60351562, + "router_z_loss_mlp": 2.76757812, + "step": 3054, + "time_per_iteration": 2.9768970012664795 + }, + { + "auxiliary_loss_clip": 0.01515931, + "auxiliary_loss_mlp": 0.01304926, + "balance_loss_clip": 1.15513682, + "balance_loss_mlp": 1.02702487, + "epoch": 0.3673420309024229, + "flos": 13592146800960.0, + "grad_norm": 2.099218417478249, + "language_loss": 0.80213946, + "learning_rate": 2.920520978656093e-06, + "loss": 0.83034801, + "num_input_tokens_seen": 65541790, + "router_z_loss_clip": 3.609375, + "router_z_loss_mlp": 2.78320312, + "step": 3055, + "time_per_iteration": 3.0504162311553955 + }, + { + "auxiliary_loss_clip": 0.01514333, + "auxiliary_loss_mlp": 0.01305941, + "balance_loss_clip": 1.15390635, + "balance_loss_mlp": 1.02823091, + "epoch": 0.367462273793062, + "flos": 28989464664960.0, + "grad_norm": 2.0400657421983297, + "language_loss": 0.76710945, + "learning_rate": 2.919829350352729e-06, + "loss": 0.79531217, + "num_input_tokens_seen": 65563395, + "router_z_loss_clip": 3.609375, + "router_z_loss_mlp": 2.78125, + "step": 3056, + "time_per_iteration": 3.0381557941436768 + }, + { + "auxiliary_loss_clip": 0.01581075, + "auxiliary_loss_mlp": 0.01250214, + "balance_loss_clip": 1.22332668, + "balance_loss_mlp": 1.02285767, + "epoch": 0.36758251668370107, + "flos": 62648539309440.0, + "grad_norm": 0.7650891549605616, + "language_loss": 0.59854233, + "learning_rate": 2.919137582511983e-06, + "loss": 0.62685519, + "num_input_tokens_seen": 65619835, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 2.2734375, + "step": 3057, + "time_per_iteration": 3.4120850563049316 + }, + { + "auxiliary_loss_clip": 0.01515073, + "auxiliary_loss_mlp": 0.01320641, + "balance_loss_clip": 1.15509653, + "balance_loss_mlp": 1.040833, + "epoch": 0.3677027595743402, + "flos": 12715542423360.0, + "grad_norm": 1.9658803911291434, + "language_loss": 0.64058465, + "learning_rate": 2.918445675238797e-06, + "loss": 0.66894174, + "num_input_tokens_seen": 65636760, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 2.80273438, + "step": 3058, + "time_per_iteration": 3.009645700454712 + }, + { + "auxiliary_loss_clip": 0.0151001, + "auxiliary_loss_mlp": 0.01297939, + "balance_loss_clip": 1.14986384, + "balance_loss_mlp": 1.02080131, + "epoch": 0.36782300246497923, + "flos": 25048954991520.0, + "grad_norm": 8.985187593881781, + "language_loss": 0.69890702, + "learning_rate": 2.917753628638132e-06, + "loss": 0.72698647, + "num_input_tokens_seen": 65657065, + "router_z_loss_clip": 3.60351562, + "router_z_loss_mlp": 2.77539062, + "step": 3059, + "time_per_iteration": 2.9998717308044434 + }, + { + "auxiliary_loss_clip": 0.01521942, + "auxiliary_loss_mlp": 0.01301044, + "balance_loss_clip": 1.16254294, + "balance_loss_mlp": 1.02390599, + "epoch": 0.36794324535561834, + "flos": 17421374160000.0, + "grad_norm": 2.5163314286256173, + "language_loss": 0.700378, + "learning_rate": 2.9170614428149716e-06, + "loss": 0.72860783, + "num_input_tokens_seen": 65675400, + "router_z_loss_clip": 3.58984375, + "router_z_loss_mlp": 2.77539062, + "step": 3060, + "time_per_iteration": 3.0213258266448975 + }, + { + "auxiliary_loss_clip": 0.01530542, + "auxiliary_loss_mlp": 0.01312326, + "balance_loss_clip": 1.16980219, + "balance_loss_mlp": 1.0328989, + "epoch": 0.36806348824625745, + "flos": 24090956053920.0, + "grad_norm": 2.5041492546933943, + "language_loss": 0.86960137, + "learning_rate": 2.9163691178743195e-06, + "loss": 0.89803004, + "num_input_tokens_seen": 65694050, + "router_z_loss_clip": 3.609375, + "router_z_loss_mlp": 2.79882812, + "step": 3061, + "time_per_iteration": 3.8755085468292236 + }, + { + "auxiliary_loss_clip": 0.01516686, + "auxiliary_loss_mlp": 0.0130464, + "balance_loss_clip": 1.15726352, + "balance_loss_mlp": 1.02921915, + "epoch": 0.3681837311368965, + "flos": 20523966543360.0, + "grad_norm": 2.332981454018855, + "language_loss": 0.78023815, + "learning_rate": 2.9156766539212006e-06, + "loss": 0.80845141, + "num_input_tokens_seen": 65711695, + "router_z_loss_clip": 3.59570312, + "router_z_loss_mlp": 2.7578125, + "step": 3062, + "time_per_iteration": 3.9396157264709473 + }, + { + "auxiliary_loss_clip": 0.01516385, + "auxiliary_loss_mlp": 0.01312597, + "balance_loss_clip": 1.15685999, + "balance_loss_mlp": 1.03908348, + "epoch": 0.3683039740275356, + "flos": 21468576840480.0, + "grad_norm": 2.381060887739241, + "language_loss": 0.72236747, + "learning_rate": 2.9149840510606614e-06, + "loss": 0.75065732, + "num_input_tokens_seen": 65730350, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 2.73828125, + "step": 3063, + "time_per_iteration": 3.0234994888305664 + }, + { + "auxiliary_loss_clip": 0.01602407, + "auxiliary_loss_mlp": 0.01221886, + "balance_loss_clip": 1.24595189, + "balance_loss_mlp": 1.00139618, + "epoch": 0.36842421691817473, + "flos": 70386985245600.0, + "grad_norm": 1.061268540395344, + "language_loss": 0.64119577, + "learning_rate": 2.914291309397769e-06, + "loss": 0.66943872, + "num_input_tokens_seen": 65787820, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.2109375, + "step": 3064, + "time_per_iteration": 3.6154935359954834 + }, + { + "auxiliary_loss_clip": 0.01520535, + "auxiliary_loss_mlp": 0.01298455, + "balance_loss_clip": 1.16123974, + "balance_loss_mlp": 1.02322435, + "epoch": 0.3685444598088138, + "flos": 23333636832480.0, + "grad_norm": 4.486300245331195, + "language_loss": 0.78362477, + "learning_rate": 2.9135984290376117e-06, + "loss": 0.81181461, + "num_input_tokens_seen": 65806685, + "router_z_loss_clip": 3.6015625, + "router_z_loss_mlp": 2.75585938, + "step": 3065, + "time_per_iteration": 3.784695625305176 + }, + { + "auxiliary_loss_clip": 0.01523761, + "auxiliary_loss_mlp": 0.01306767, + "balance_loss_clip": 1.16436017, + "balance_loss_mlp": 1.03172684, + "epoch": 0.3686647026994529, + "flos": 23072195475360.0, + "grad_norm": 1.7792114622804904, + "language_loss": 0.82842189, + "learning_rate": 2.9129054100853e-06, + "loss": 0.85672718, + "num_input_tokens_seen": 65825525, + "router_z_loss_clip": 3.59375, + "router_z_loss_mlp": 2.75390625, + "step": 3066, + "time_per_iteration": 3.0350451469421387 + }, + { + "auxiliary_loss_clip": 0.0152375, + "auxiliary_loss_mlp": 0.01302111, + "balance_loss_clip": 1.16393518, + "balance_loss_mlp": 1.02440071, + "epoch": 0.368784945590092, + "flos": 25122308781600.0, + "grad_norm": 4.011689278983667, + "language_loss": 0.76091623, + "learning_rate": 2.912212252645963e-06, + "loss": 0.7891748, + "num_input_tokens_seen": 65848110, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 2.78125, + "step": 3067, + "time_per_iteration": 3.0914547443389893 + }, + { + "auxiliary_loss_clip": 0.01524188, + "auxiliary_loss_mlp": 0.01302145, + "balance_loss_clip": 1.16494799, + "balance_loss_mlp": 1.02271891, + "epoch": 0.36890518848073106, + "flos": 18444268908000.0, + "grad_norm": 5.968853734061318, + "language_loss": 0.76398039, + "learning_rate": 2.9115189568247523e-06, + "loss": 0.79224372, + "num_input_tokens_seen": 65865670, + "router_z_loss_clip": 3.59375, + "router_z_loss_mlp": 2.796875, + "step": 3068, + "time_per_iteration": 3.0285699367523193 + }, + { + "auxiliary_loss_clip": 0.015246, + "auxiliary_loss_mlp": 0.01307478, + "balance_loss_clip": 1.16664886, + "balance_loss_mlp": 1.0322479, + "epoch": 0.36902543137137017, + "flos": 16364305775520.0, + "grad_norm": 2.223419469241752, + "language_loss": 0.91882157, + "learning_rate": 2.910825522726841e-06, + "loss": 0.9471423, + "num_input_tokens_seen": 65883195, + "router_z_loss_clip": 3.58203125, + "router_z_loss_mlp": 2.75585938, + "step": 3069, + "time_per_iteration": 3.866580009460449 + }, + { + "auxiliary_loss_clip": 0.01522559, + "auxiliary_loss_mlp": 0.01305992, + "balance_loss_clip": 1.16303754, + "balance_loss_mlp": 1.02999842, + "epoch": 0.3691456742620093, + "flos": 12277202306400.0, + "grad_norm": 2.1186343009944384, + "language_loss": 0.7703318, + "learning_rate": 2.9101319504574215e-06, + "loss": 0.79861724, + "num_input_tokens_seen": 65899635, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 2.76367188, + "step": 3070, + "time_per_iteration": 3.0619654655456543 + }, + { + "auxiliary_loss_clip": 0.01517475, + "auxiliary_loss_mlp": 0.0130676, + "balance_loss_clip": 1.15748596, + "balance_loss_mlp": 1.0311482, + "epoch": 0.36926591715264834, + "flos": 17788408607520.0, + "grad_norm": 1.9923160046781518, + "language_loss": 0.76140463, + "learning_rate": 2.909438240121709e-06, + "loss": 0.78964692, + "num_input_tokens_seen": 65919910, + "router_z_loss_clip": 3.60351562, + "router_z_loss_mlp": 2.75976562, + "step": 3071, + "time_per_iteration": 3.0525825023651123 + }, + { + "auxiliary_loss_clip": 0.01524805, + "auxiliary_loss_mlp": 0.01302543, + "balance_loss_clip": 1.16501212, + "balance_loss_mlp": 1.02674031, + "epoch": 0.36938616004328745, + "flos": 28950777577440.0, + "grad_norm": 2.1752719705557375, + "language_loss": 0.71121073, + "learning_rate": 2.908744391824939e-06, + "loss": 0.73948419, + "num_input_tokens_seen": 65940930, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 2.76171875, + "step": 3072, + "time_per_iteration": 3.1460163593292236 + }, + { + "auxiliary_loss_clip": 0.01520508, + "auxiliary_loss_mlp": 0.01302367, + "balance_loss_clip": 1.16171765, + "balance_loss_mlp": 1.026564, + "epoch": 0.36950640293392656, + "flos": 29207970980640.0, + "grad_norm": 1.8263787557760967, + "language_loss": 0.79252172, + "learning_rate": 2.908050405672367e-06, + "loss": 0.82075042, + "num_input_tokens_seen": 65960475, + "router_z_loss_clip": 3.58789062, + "router_z_loss_mlp": 2.76171875, + "step": 3073, + "time_per_iteration": 3.2113258838653564 + }, + { + "auxiliary_loss_clip": 0.01518082, + "auxiliary_loss_mlp": 0.01305885, + "balance_loss_clip": 1.15867233, + "balance_loss_mlp": 1.0300827, + "epoch": 0.3696266458245656, + "flos": 24830145250560.0, + "grad_norm": 1.7988254250926796, + "language_loss": 0.79245043, + "learning_rate": 2.9073562817692703e-06, + "loss": 0.8206901, + "num_input_tokens_seen": 65979160, + "router_z_loss_clip": 3.58984375, + "router_z_loss_mlp": 2.76171875, + "step": 3074, + "time_per_iteration": 3.075432538986206 + }, + { + "auxiliary_loss_clip": 0.01626667, + "auxiliary_loss_mlp": 0.01214279, + "balance_loss_clip": 1.27003562, + "balance_loss_mlp": 0.99073792, + "epoch": 0.3697468887152047, + "flos": 59893448006880.0, + "grad_norm": 0.7351122012821537, + "language_loss": 0.5650546, + "learning_rate": 2.9066620202209468e-06, + "loss": 0.59346402, + "num_input_tokens_seen": 66041650, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.2421875, + "step": 3075, + "time_per_iteration": 3.3789899349212646 + }, + { + "auxiliary_loss_clip": 0.01529639, + "auxiliary_loss_mlp": 0.01313462, + "balance_loss_clip": 1.1696465, + "balance_loss_mlp": 1.03765941, + "epoch": 0.3698671316058438, + "flos": 26139817730880.0, + "grad_norm": 2.2114942334310537, + "language_loss": 0.77746129, + "learning_rate": 2.905967621132716e-06, + "loss": 0.80589229, + "num_input_tokens_seen": 66059260, + "router_z_loss_clip": 3.6015625, + "router_z_loss_mlp": 2.76171875, + "step": 3076, + "time_per_iteration": 3.0397093296051025 + }, + { + "auxiliary_loss_clip": 0.01527955, + "auxiliary_loss_mlp": 0.01309287, + "balance_loss_clip": 1.1691047, + "balance_loss_mlp": 1.03195882, + "epoch": 0.3699873744964829, + "flos": 24609742526880.0, + "grad_norm": 3.228738457196656, + "language_loss": 0.75357449, + "learning_rate": 2.9052730846099172e-06, + "loss": 0.7819469, + "num_input_tokens_seen": 66080605, + "router_z_loss_clip": 3.58789062, + "router_z_loss_mlp": 2.77734375, + "step": 3077, + "time_per_iteration": 2.958045482635498 + }, + { + "auxiliary_loss_clip": 0.01615569, + "auxiliary_loss_mlp": 0.01226433, + "balance_loss_clip": 1.25993299, + "balance_loss_mlp": 1.0021286, + "epoch": 0.370107617387122, + "flos": 64891784628000.0, + "grad_norm": 0.8535910919749301, + "language_loss": 0.60868758, + "learning_rate": 2.9045784107579123e-06, + "loss": 0.63710761, + "num_input_tokens_seen": 66140710, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.25, + "step": 3078, + "time_per_iteration": 3.4330568313598633 + }, + { + "auxiliary_loss_clip": 0.01525159, + "auxiliary_loss_mlp": 0.01307139, + "balance_loss_clip": 1.16807687, + "balance_loss_mlp": 1.02885675, + "epoch": 0.37022786027776106, + "flos": 15963552829440.0, + "grad_norm": 1.8870251992873412, + "language_loss": 0.67015052, + "learning_rate": 2.9038835996820807e-06, + "loss": 0.69847351, + "num_input_tokens_seen": 66158320, + "router_z_loss_clip": 3.57617188, + "router_z_loss_mlp": 2.78710938, + "step": 3079, + "time_per_iteration": 2.881894588470459 + }, + { + "auxiliary_loss_clip": 0.01526779, + "auxiliary_loss_mlp": 0.01306399, + "balance_loss_clip": 1.1672914, + "balance_loss_mlp": 1.03193164, + "epoch": 0.37034810316840017, + "flos": 18548648297280.0, + "grad_norm": 1.7614934810049583, + "language_loss": 0.79308176, + "learning_rate": 2.903188651487826e-06, + "loss": 0.82141352, + "num_input_tokens_seen": 66176875, + "router_z_loss_clip": 3.6015625, + "router_z_loss_mlp": 2.74804688, + "step": 3080, + "time_per_iteration": 2.945749282836914 + }, + { + "auxiliary_loss_clip": 0.01531376, + "auxiliary_loss_mlp": 0.01315748, + "balance_loss_clip": 1.17224371, + "balance_loss_mlp": 1.04013646, + "epoch": 0.3704683460590393, + "flos": 17823871801440.0, + "grad_norm": 2.270781513480341, + "language_loss": 0.86590725, + "learning_rate": 2.902493566280571e-06, + "loss": 0.89437854, + "num_input_tokens_seen": 66194980, + "router_z_loss_clip": 3.58984375, + "router_z_loss_mlp": 2.75976562, + "step": 3081, + "time_per_iteration": 2.97287654876709 + }, + { + "auxiliary_loss_clip": 0.01523708, + "auxiliary_loss_mlp": 0.01315247, + "balance_loss_clip": 1.16436934, + "balance_loss_mlp": 1.03620219, + "epoch": 0.37058858894967833, + "flos": 14135662798560.0, + "grad_norm": 2.1696883683523334, + "language_loss": 0.81178951, + "learning_rate": 2.9017983441657595e-06, + "loss": 0.84017915, + "num_input_tokens_seen": 66212310, + "router_z_loss_clip": 3.59375, + "router_z_loss_mlp": 2.79492188, + "step": 3082, + "time_per_iteration": 2.913926601409912 + }, + { + "auxiliary_loss_clip": 0.01525695, + "auxiliary_loss_mlp": 0.01302227, + "balance_loss_clip": 1.16648602, + "balance_loss_mlp": 1.02680588, + "epoch": 0.37070883184031744, + "flos": 13956222852000.0, + "grad_norm": 2.5578175121907636, + "language_loss": 0.75348413, + "learning_rate": 2.9011029852488564e-06, + "loss": 0.78176332, + "num_input_tokens_seen": 66229545, + "router_z_loss_clip": 3.58984375, + "router_z_loss_mlp": 2.7578125, + "step": 3083, + "time_per_iteration": 3.0380868911743164 + }, + { + "auxiliary_loss_clip": 0.01609432, + "auxiliary_loss_mlp": 0.012388, + "balance_loss_clip": 1.25260186, + "balance_loss_mlp": 1.01296997, + "epoch": 0.37082907473095655, + "flos": 52320105172800.0, + "grad_norm": 0.9797123451436643, + "language_loss": 0.62474644, + "learning_rate": 2.9004074896353465e-06, + "loss": 0.65322876, + "num_input_tokens_seen": 66283545, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.2578125, + "step": 3084, + "time_per_iteration": 3.348879098892212 + }, + { + "auxiliary_loss_clip": 0.01533708, + "auxiliary_loss_mlp": 0.01313817, + "balance_loss_clip": 1.17445183, + "balance_loss_mlp": 1.0370605, + "epoch": 0.3709493176215956, + "flos": 15999964227360.0, + "grad_norm": 2.7325199772610067, + "language_loss": 0.81780505, + "learning_rate": 2.8997118574307362e-06, + "loss": 0.84628034, + "num_input_tokens_seen": 66300500, + "router_z_loss_clip": 3.59179688, + "router_z_loss_mlp": 2.77148438, + "step": 3085, + "time_per_iteration": 3.0974526405334473 + }, + { + "auxiliary_loss_clip": 0.0152846, + "auxiliary_loss_mlp": 0.01312504, + "balance_loss_clip": 1.16806293, + "balance_loss_mlp": 1.03441238, + "epoch": 0.3710695605122347, + "flos": 20961965306880.0, + "grad_norm": 2.0401890488993053, + "language_loss": 0.74269253, + "learning_rate": 2.899016088740553e-06, + "loss": 0.77110219, + "num_input_tokens_seen": 66318610, + "router_z_loss_clip": 3.60351562, + "router_z_loss_mlp": 2.78515625, + "step": 3086, + "time_per_iteration": 3.066222667694092 + }, + { + "auxiliary_loss_clip": 0.01525963, + "auxiliary_loss_mlp": 0.01296957, + "balance_loss_clip": 1.16647375, + "balance_loss_mlp": 1.01905596, + "epoch": 0.37118980340287383, + "flos": 14357317151520.0, + "grad_norm": 2.472483055627894, + "language_loss": 0.78793138, + "learning_rate": 2.898320183670344e-06, + "loss": 0.81616056, + "num_input_tokens_seen": 66336025, + "router_z_loss_clip": 3.59570312, + "router_z_loss_mlp": 2.78320312, + "step": 3087, + "time_per_iteration": 3.109729290008545 + }, + { + "auxiliary_loss_clip": 0.0152628, + "auxiliary_loss_mlp": 0.01301304, + "balance_loss_clip": 1.1661588, + "balance_loss_mlp": 1.02550125, + "epoch": 0.3713100462935129, + "flos": 25887782557440.0, + "grad_norm": 2.2793058900485765, + "language_loss": 0.88948655, + "learning_rate": 2.8976241423256767e-06, + "loss": 0.9177624, + "num_input_tokens_seen": 66356120, + "router_z_loss_clip": 3.6015625, + "router_z_loss_mlp": 2.76171875, + "step": 3088, + "time_per_iteration": 3.2083191871643066 + }, + { + "auxiliary_loss_clip": 0.0153253, + "auxiliary_loss_mlp": 0.01306934, + "balance_loss_clip": 1.17320502, + "balance_loss_mlp": 1.03761601, + "epoch": 0.371430289184152, + "flos": 30521587989600.0, + "grad_norm": 2.441971947846145, + "language_loss": 0.68448877, + "learning_rate": 2.896927964812142e-06, + "loss": 0.71288347, + "num_input_tokens_seen": 66376685, + "router_z_loss_clip": 3.59570312, + "router_z_loss_mlp": 2.6953125, + "step": 3089, + "time_per_iteration": 4.714641094207764 + }, + { + "auxiliary_loss_clip": 0.01527498, + "auxiliary_loss_mlp": 0.01309909, + "balance_loss_clip": 1.16853142, + "balance_loss_mlp": 1.03467882, + "epoch": 0.37155053207479105, + "flos": 15744060381600.0, + "grad_norm": 2.681700635825945, + "language_loss": 0.74364156, + "learning_rate": 2.8962316512353465e-06, + "loss": 0.77201569, + "num_input_tokens_seen": 66394230, + "router_z_loss_clip": 3.59375, + "router_z_loss_mlp": 2.75585938, + "step": 3090, + "time_per_iteration": 2.9634203910827637 + }, + { + "auxiliary_loss_clip": 0.01524074, + "auxiliary_loss_mlp": 0.01304451, + "balance_loss_clip": 1.16296124, + "balance_loss_mlp": 1.02902985, + "epoch": 0.37167077496543016, + "flos": 23406307915680.0, + "grad_norm": 1.645538665152468, + "language_loss": 0.7517519, + "learning_rate": 2.8955352017009233e-06, + "loss": 0.78003716, + "num_input_tokens_seen": 66413475, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 2.7578125, + "step": 3091, + "time_per_iteration": 2.9470083713531494 + }, + { + "auxiliary_loss_clip": 0.015266, + "auxiliary_loss_mlp": 0.01311062, + "balance_loss_clip": 1.16775751, + "balance_loss_mlp": 1.03239787, + "epoch": 0.3717910178560693, + "flos": 22090718642400.0, + "grad_norm": 2.349845078397642, + "language_loss": 0.77196902, + "learning_rate": 2.8948386163145212e-06, + "loss": 0.80034566, + "num_input_tokens_seen": 66432685, + "router_z_loss_clip": 3.58984375, + "router_z_loss_mlp": 2.79101562, + "step": 3092, + "time_per_iteration": 3.792022705078125 + }, + { + "auxiliary_loss_clip": 0.01527225, + "auxiliary_loss_mlp": 0.01303896, + "balance_loss_clip": 1.16821957, + "balance_loss_mlp": 1.02752149, + "epoch": 0.3719112607467083, + "flos": 26942233898880.0, + "grad_norm": 1.8538177256927453, + "language_loss": 0.7956984, + "learning_rate": 2.8941418951818135e-06, + "loss": 0.82400966, + "num_input_tokens_seen": 66452245, + "router_z_loss_clip": 3.59375, + "router_z_loss_mlp": 2.76757812, + "step": 3093, + "time_per_iteration": 2.9542226791381836 + }, + { + "auxiliary_loss_clip": 0.0151884, + "auxiliary_loss_mlp": 0.01300298, + "balance_loss_clip": 1.15938962, + "balance_loss_mlp": 1.02583086, + "epoch": 0.37203150363734744, + "flos": 12168119825280.0, + "grad_norm": 2.4541666813089487, + "language_loss": 0.70933384, + "learning_rate": 2.8934450384084903e-06, + "loss": 0.73752522, + "num_input_tokens_seen": 66469760, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 2.74804688, + "step": 3094, + "time_per_iteration": 2.949181318283081 + }, + { + "auxiliary_loss_clip": 0.01516047, + "auxiliary_loss_mlp": 0.01307523, + "balance_loss_clip": 1.1569339, + "balance_loss_mlp": 1.03019404, + "epoch": 0.37215174652798655, + "flos": 23699950644960.0, + "grad_norm": 2.00981415347676, + "language_loss": 0.70266336, + "learning_rate": 2.8927480461002653e-06, + "loss": 0.73089904, + "num_input_tokens_seen": 66489730, + "router_z_loss_clip": 3.58984375, + "router_z_loss_mlp": 2.77734375, + "step": 3095, + "time_per_iteration": 2.9681296348571777 + }, + { + "auxiliary_loss_clip": 0.01521342, + "auxiliary_loss_mlp": 0.01318468, + "balance_loss_clip": 1.1618048, + "balance_loss_mlp": 1.04209268, + "epoch": 0.3722719894186256, + "flos": 17889298606080.0, + "grad_norm": 8.004935543391879, + "language_loss": 0.86135137, + "learning_rate": 2.892050918362872e-06, + "loss": 0.88974953, + "num_input_tokens_seen": 66504785, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 2.76757812, + "step": 3096, + "time_per_iteration": 3.804229259490967 + }, + { + "auxiliary_loss_clip": 0.01596757, + "auxiliary_loss_mlp": 0.0120874, + "balance_loss_clip": 1.24003339, + "balance_loss_mlp": 0.98443604, + "epoch": 0.3723922323092647, + "flos": 62425026476640.0, + "grad_norm": 0.8541713380409353, + "language_loss": 0.55785859, + "learning_rate": 2.8913536553020626e-06, + "loss": 0.58591354, + "num_input_tokens_seen": 66558840, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.25, + "step": 3097, + "time_per_iteration": 3.439873456954956 + }, + { + "auxiliary_loss_clip": 0.01517676, + "auxiliary_loss_mlp": 0.01310232, + "balance_loss_clip": 1.15833735, + "balance_loss_mlp": 1.03423846, + "epoch": 0.3725124751999038, + "flos": 23042004295680.0, + "grad_norm": 2.1089718705081655, + "language_loss": 0.85028839, + "learning_rate": 2.8906562570236137e-06, + "loss": 0.87856752, + "num_input_tokens_seen": 66576750, + "router_z_loss_clip": 3.59375, + "router_z_loss_mlp": 2.76367188, + "step": 3098, + "time_per_iteration": 2.9543521404266357 + }, + { + "auxiliary_loss_clip": 0.01516494, + "auxiliary_loss_mlp": 0.01297961, + "balance_loss_clip": 1.15810668, + "balance_loss_mlp": 1.02482879, + "epoch": 0.3726327180905429, + "flos": 20922519656160.0, + "grad_norm": 1.780108746024899, + "language_loss": 0.76205438, + "learning_rate": 2.889958723633318e-06, + "loss": 0.79019892, + "num_input_tokens_seen": 66595690, + "router_z_loss_clip": 3.58203125, + "router_z_loss_mlp": 2.734375, + "step": 3099, + "time_per_iteration": 2.9331870079040527 + }, + { + "auxiliary_loss_clip": 0.01518032, + "auxiliary_loss_mlp": 0.01309421, + "balance_loss_clip": 1.15809822, + "balance_loss_mlp": 1.03457212, + "epoch": 0.372752960981182, + "flos": 30594827995200.0, + "grad_norm": 1.6387387837819676, + "language_loss": 0.73755467, + "learning_rate": 2.889261055236992e-06, + "loss": 0.76582915, + "num_input_tokens_seen": 66617905, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 2.75195312, + "step": 3100, + "time_per_iteration": 3.0480501651763916 + }, + { + "auxiliary_loss_clip": 0.01517965, + "auxiliary_loss_mlp": 0.01313942, + "balance_loss_clip": 1.16041267, + "balance_loss_mlp": 1.03604186, + "epoch": 0.3728732038718211, + "flos": 25118971103520.0, + "grad_norm": 2.0710595136527137, + "language_loss": 0.8299579, + "learning_rate": 2.8885632519404704e-06, + "loss": 0.85827702, + "num_input_tokens_seen": 66638175, + "router_z_loss_clip": 3.58007812, + "router_z_loss_mlp": 2.78320312, + "step": 3101, + "time_per_iteration": 2.929133892059326 + }, + { + "auxiliary_loss_clip": 0.01510808, + "auxiliary_loss_mlp": 0.01302691, + "balance_loss_clip": 1.15082967, + "balance_loss_mlp": 1.02707863, + "epoch": 0.37299344676246016, + "flos": 25304441627520.0, + "grad_norm": 2.1415873146693873, + "language_loss": 0.76208186, + "learning_rate": 2.8878653138496107e-06, + "loss": 0.79021686, + "num_input_tokens_seen": 66658670, + "router_z_loss_clip": 3.60351562, + "router_z_loss_mlp": 2.75976562, + "step": 3102, + "time_per_iteration": 3.132751703262329 + }, + { + "auxiliary_loss_clip": 0.01507603, + "auxiliary_loss_mlp": 0.01317099, + "balance_loss_clip": 1.14669561, + "balance_loss_mlp": 1.03862596, + "epoch": 0.37311368965309927, + "flos": 23844837673440.0, + "grad_norm": 5.116984854398471, + "language_loss": 0.76828945, + "learning_rate": 2.8871672410702878e-06, + "loss": 0.79653645, + "num_input_tokens_seen": 66676030, + "router_z_loss_clip": 3.609375, + "router_z_loss_mlp": 2.7890625, + "step": 3103, + "time_per_iteration": 2.9610493183135986 + }, + { + "auxiliary_loss_clip": 0.0151751, + "auxiliary_loss_mlp": 0.01306568, + "balance_loss_clip": 1.15763533, + "balance_loss_mlp": 1.0313375, + "epoch": 0.3732339325437384, + "flos": 25814314982880.0, + "grad_norm": 1.832590754857674, + "language_loss": 0.82229245, + "learning_rate": 2.8864690337084008e-06, + "loss": 0.85053325, + "num_input_tokens_seen": 66695305, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 2.75585938, + "step": 3104, + "time_per_iteration": 2.9569997787475586 + }, + { + "auxiliary_loss_clip": 0.01511772, + "auxiliary_loss_mlp": 0.01297469, + "balance_loss_clip": 1.15234768, + "balance_loss_mlp": 1.02529073, + "epoch": 0.37335417543437743, + "flos": 26210554477920.0, + "grad_norm": 2.0830498412739833, + "language_loss": 0.78207874, + "learning_rate": 2.885770691869866e-06, + "loss": 0.81017113, + "num_input_tokens_seen": 66716185, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 2.72460938, + "step": 3105, + "time_per_iteration": 3.1418356895446777 + }, + { + "auxiliary_loss_clip": 0.01513787, + "auxiliary_loss_mlp": 0.01306255, + "balance_loss_clip": 1.15370619, + "balance_loss_mlp": 1.03331375, + "epoch": 0.37347441832501654, + "flos": 24026136099840.0, + "grad_norm": 9.207572376622467, + "language_loss": 0.74223888, + "learning_rate": 2.8850722156606207e-06, + "loss": 0.77043927, + "num_input_tokens_seen": 66734575, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 2.73242188, + "step": 3106, + "time_per_iteration": 2.978163242340088 + }, + { + "auxiliary_loss_clip": 0.01516706, + "auxiliary_loss_mlp": 0.01306236, + "balance_loss_clip": 1.15605509, + "balance_loss_mlp": 1.03424835, + "epoch": 0.3735946612156556, + "flos": 19716733499040.0, + "grad_norm": 1.6927207165125204, + "language_loss": 0.66974902, + "learning_rate": 2.8843736051866252e-06, + "loss": 0.6979785, + "num_input_tokens_seen": 66753500, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 2.72265625, + "step": 3107, + "time_per_iteration": 2.9740970134735107 + }, + { + "auxiliary_loss_clip": 0.01516939, + "auxiliary_loss_mlp": 0.01305484, + "balance_loss_clip": 1.1572361, + "balance_loss_mlp": 1.03254247, + "epoch": 0.3737149041062947, + "flos": 23041852583040.0, + "grad_norm": 1.4986238071856504, + "language_loss": 0.69231784, + "learning_rate": 2.8836748605538557e-06, + "loss": 0.72054207, + "num_input_tokens_seen": 66775140, + "router_z_loss_clip": 3.59570312, + "router_z_loss_mlp": 2.73242188, + "step": 3108, + "time_per_iteration": 2.9887850284576416 + }, + { + "auxiliary_loss_clip": 0.01521197, + "auxiliary_loss_mlp": 0.01311176, + "balance_loss_clip": 1.16121125, + "balance_loss_mlp": 1.03422892, + "epoch": 0.3738351469969338, + "flos": 34680983260320.0, + "grad_norm": 2.982496973196934, + "language_loss": 0.63238978, + "learning_rate": 2.882975981868313e-06, + "loss": 0.66071349, + "num_input_tokens_seen": 66795525, + "router_z_loss_clip": 3.60351562, + "router_z_loss_mlp": 2.7734375, + "step": 3109, + "time_per_iteration": 3.020145893096924 + }, + { + "auxiliary_loss_clip": 0.01518737, + "auxiliary_loss_mlp": 0.01312555, + "balance_loss_clip": 1.15862298, + "balance_loss_mlp": 1.03656173, + "epoch": 0.3739553898875729, + "flos": 43511353924320.0, + "grad_norm": 2.8159930805456757, + "language_loss": 0.68825233, + "learning_rate": 2.882276969236016e-06, + "loss": 0.71656525, + "num_input_tokens_seen": 66816885, + "router_z_loss_clip": 3.60351562, + "router_z_loss_mlp": 2.76367188, + "step": 3110, + "time_per_iteration": 3.1911232471466064 + }, + { + "auxiliary_loss_clip": 0.01517748, + "auxiliary_loss_mlp": 0.01299826, + "balance_loss_clip": 1.15841722, + "balance_loss_mlp": 1.02688444, + "epoch": 0.374075632778212, + "flos": 12857964121440.0, + "grad_norm": 2.4394172957654865, + "language_loss": 0.76776934, + "learning_rate": 2.881577822763005e-06, + "loss": 0.79594505, + "num_input_tokens_seen": 66834835, + "router_z_loss_clip": 3.59375, + "router_z_loss_mlp": 2.73242188, + "step": 3111, + "time_per_iteration": 2.954777956008911 + }, + { + "auxiliary_loss_clip": 0.01509644, + "auxiliary_loss_mlp": 0.0129749, + "balance_loss_clip": 1.15006065, + "balance_loss_mlp": 1.02302289, + "epoch": 0.3741958756688511, + "flos": 26026032157920.0, + "grad_norm": 2.4828837518405495, + "language_loss": 0.87643981, + "learning_rate": 2.880878542555338e-06, + "loss": 0.90451115, + "num_input_tokens_seen": 66852600, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 2.74804688, + "step": 3112, + "time_per_iteration": 2.9941842555999756 + }, + { + "auxiliary_loss_clip": 0.01517277, + "auxiliary_loss_mlp": 0.01315528, + "balance_loss_clip": 1.15772223, + "balance_loss_mlp": 1.04010701, + "epoch": 0.37431611855949015, + "flos": 21436034114880.0, + "grad_norm": 2.987757220456187, + "language_loss": 0.81033659, + "learning_rate": 2.8801791287190976e-06, + "loss": 0.83866465, + "num_input_tokens_seen": 66870595, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 2.7578125, + "step": 3113, + "time_per_iteration": 2.9487709999084473 + }, + { + "auxiliary_loss_clip": 0.01521666, + "auxiliary_loss_mlp": 0.01307664, + "balance_loss_clip": 1.16139603, + "balance_loss_mlp": 1.02880979, + "epoch": 0.37443636145012926, + "flos": 24209406790560.0, + "grad_norm": 3.024061333820237, + "language_loss": 0.8613804, + "learning_rate": 2.8794795813603817e-06, + "loss": 0.88967371, + "num_input_tokens_seen": 66886060, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 2.79296875, + "step": 3114, + "time_per_iteration": 2.9622297286987305 + }, + { + "auxiliary_loss_clip": 0.01520157, + "auxiliary_loss_mlp": 0.01297869, + "balance_loss_clip": 1.16062713, + "balance_loss_mlp": 1.02378273, + "epoch": 0.3745566043407684, + "flos": 15380477396640.0, + "grad_norm": 2.0256568691655485, + "language_loss": 0.81372058, + "learning_rate": 2.878779900585314e-06, + "loss": 0.84190083, + "num_input_tokens_seen": 66903900, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 2.74414062, + "step": 3115, + "time_per_iteration": 2.927579402923584 + }, + { + "auxiliary_loss_clip": 0.01524022, + "auxiliary_loss_mlp": 0.01310836, + "balance_loss_clip": 1.16402066, + "balance_loss_mlp": 1.03331685, + "epoch": 0.37467684723140743, + "flos": 24610539018240.0, + "grad_norm": 1.5000602113449717, + "language_loss": 0.75216079, + "learning_rate": 2.8780800865000336e-06, + "loss": 0.78050935, + "num_input_tokens_seen": 66925210, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 2.77929688, + "step": 3116, + "time_per_iteration": 3.7675704956054688 + }, + { + "auxiliary_loss_clip": 0.01593896, + "auxiliary_loss_mlp": 0.01250526, + "balance_loss_clip": 1.23693204, + "balance_loss_mlp": 1.02317047, + "epoch": 0.37479709012204654, + "flos": 64384073177760.0, + "grad_norm": 1.0000916532731352, + "language_loss": 0.59210289, + "learning_rate": 2.877380139210702e-06, + "loss": 0.62054706, + "num_input_tokens_seen": 66983880, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.2734375, + "step": 3117, + "time_per_iteration": 4.229512929916382 + }, + { + "auxiliary_loss_clip": 0.01525988, + "auxiliary_loss_mlp": 0.01309523, + "balance_loss_clip": 1.16673553, + "balance_loss_mlp": 1.03257585, + "epoch": 0.37491733301268565, + "flos": 23806150585920.0, + "grad_norm": 2.0170273166957697, + "language_loss": 0.76301503, + "learning_rate": 2.876680058823501e-06, + "loss": 0.79137021, + "num_input_tokens_seen": 67004280, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 2.7734375, + "step": 3118, + "time_per_iteration": 2.992141008377075 + }, + { + "auxiliary_loss_clip": 0.01517938, + "auxiliary_loss_mlp": 0.01306946, + "balance_loss_clip": 1.15799618, + "balance_loss_mlp": 1.0279007, + "epoch": 0.3750375759033247, + "flos": 32163704071200.0, + "grad_norm": 2.246671884971853, + "language_loss": 0.66162193, + "learning_rate": 2.8759798454446314e-06, + "loss": 0.68987072, + "num_input_tokens_seen": 67027445, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 2.79492188, + "step": 3119, + "time_per_iteration": 3.0181970596313477 + }, + { + "auxiliary_loss_clip": 0.0151936, + "auxiliary_loss_mlp": 0.01302313, + "balance_loss_clip": 1.16079211, + "balance_loss_mlp": 1.03089714, + "epoch": 0.3751578187939638, + "flos": 23370048230400.0, + "grad_norm": 2.038095297845626, + "language_loss": 0.81685543, + "learning_rate": 2.8752794991803173e-06, + "loss": 0.84507221, + "num_input_tokens_seen": 67045130, + "router_z_loss_clip": 3.5859375, + "router_z_loss_mlp": 2.71679688, + "step": 3120, + "time_per_iteration": 3.8006887435913086 + }, + { + "auxiliary_loss_clip": 0.01526114, + "auxiliary_loss_mlp": 0.01300598, + "balance_loss_clip": 1.16814935, + "balance_loss_mlp": 1.02937317, + "epoch": 0.37527806168460287, + "flos": 14607190419840.0, + "grad_norm": 2.673399817950645, + "language_loss": 0.74965966, + "learning_rate": 2.8745790201367976e-06, + "loss": 0.77792668, + "num_input_tokens_seen": 67060885, + "router_z_loss_clip": 3.58203125, + "router_z_loss_mlp": 2.71484375, + "step": 3121, + "time_per_iteration": 2.9303739070892334 + }, + { + "auxiliary_loss_clip": 0.0152036, + "auxiliary_loss_mlp": 0.01302147, + "balance_loss_clip": 1.16140711, + "balance_loss_mlp": 1.02825165, + "epoch": 0.375398304575242, + "flos": 26392952820960.0, + "grad_norm": 2.3560741546831183, + "language_loss": 0.84313083, + "learning_rate": 2.8738784084203373e-06, + "loss": 0.87135589, + "num_input_tokens_seen": 67080960, + "router_z_loss_clip": 3.58984375, + "router_z_loss_mlp": 2.7421875, + "step": 3122, + "time_per_iteration": 3.01448655128479 + }, + { + "auxiliary_loss_clip": 0.01523693, + "auxiliary_loss_mlp": 0.01303261, + "balance_loss_clip": 1.16475105, + "balance_loss_mlp": 1.03070116, + "epoch": 0.3755185474658811, + "flos": 22238791636320.0, + "grad_norm": 1.8393973994819308, + "language_loss": 0.78887296, + "learning_rate": 2.873177664137216e-06, + "loss": 0.81714255, + "num_input_tokens_seen": 67101890, + "router_z_loss_clip": 3.58984375, + "router_z_loss_mlp": 2.72851562, + "step": 3123, + "time_per_iteration": 3.7938778400421143 + }, + { + "auxiliary_loss_clip": 0.01525222, + "auxiliary_loss_mlp": 0.01300943, + "balance_loss_clip": 1.16731715, + "balance_loss_mlp": 1.02781105, + "epoch": 0.37563879035652015, + "flos": 30815723784960.0, + "grad_norm": 2.257270846063013, + "language_loss": 0.69629931, + "learning_rate": 2.8724767873937384e-06, + "loss": 0.72456104, + "num_input_tokens_seen": 67126010, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 2.734375, + "step": 3124, + "time_per_iteration": 3.075070381164551 + }, + { + "auxiliary_loss_clip": 0.01516614, + "auxiliary_loss_mlp": 0.01294482, + "balance_loss_clip": 1.15790129, + "balance_loss_mlp": 1.02268493, + "epoch": 0.37575903324715926, + "flos": 20775432794400.0, + "grad_norm": 2.384153416352183, + "language_loss": 0.87396914, + "learning_rate": 2.871775778296225e-06, + "loss": 0.90208012, + "num_input_tokens_seen": 67143100, + "router_z_loss_clip": 3.58789062, + "router_z_loss_mlp": 2.72070312, + "step": 3125, + "time_per_iteration": 2.9870736598968506 + }, + { + "auxiliary_loss_clip": 0.01520882, + "auxiliary_loss_mlp": 0.01305807, + "balance_loss_clip": 1.16204107, + "balance_loss_mlp": 1.02847826, + "epoch": 0.37587927613779837, + "flos": 18699679687680.0, + "grad_norm": 3.0404293973325696, + "language_loss": 0.78646129, + "learning_rate": 2.8710746369510196e-06, + "loss": 0.81472814, + "num_input_tokens_seen": 67161085, + "router_z_loss_clip": 3.59179688, + "router_z_loss_mlp": 2.77734375, + "step": 3126, + "time_per_iteration": 2.9259839057922363 + }, + { + "auxiliary_loss_clip": 0.01522894, + "auxiliary_loss_mlp": 0.01306331, + "balance_loss_clip": 1.16457176, + "balance_loss_mlp": 1.03110051, + "epoch": 0.3759995190284374, + "flos": 13626434221920.0, + "grad_norm": 2.6303593671254473, + "language_loss": 0.83521318, + "learning_rate": 2.8703733634644846e-06, + "loss": 0.86350548, + "num_input_tokens_seen": 67175840, + "router_z_loss_clip": 3.58398438, + "router_z_loss_mlp": 2.75585938, + "step": 3127, + "time_per_iteration": 2.936100482940674 + }, + { + "auxiliary_loss_clip": 0.01523354, + "auxiliary_loss_mlp": 0.01298158, + "balance_loss_clip": 1.16517651, + "balance_loss_mlp": 1.02559817, + "epoch": 0.37611976191907653, + "flos": 20486910366720.0, + "grad_norm": 1.8390776557198427, + "language_loss": 0.79420412, + "learning_rate": 2.869671957943002e-06, + "loss": 0.82241923, + "num_input_tokens_seen": 67194995, + "router_z_loss_clip": 3.58398438, + "router_z_loss_mlp": 2.72851562, + "step": 3128, + "time_per_iteration": 2.9333722591400146 + }, + { + "auxiliary_loss_clip": 0.01529439, + "auxiliary_loss_mlp": 0.0130159, + "balance_loss_clip": 1.17148948, + "balance_loss_mlp": 1.028458, + "epoch": 0.37624000480971564, + "flos": 21143908512000.0, + "grad_norm": 2.5036682261000993, + "language_loss": 0.74702549, + "learning_rate": 2.8689704204929747e-06, + "loss": 0.77533579, + "num_input_tokens_seen": 67214175, + "router_z_loss_clip": 3.58007812, + "router_z_loss_mlp": 2.734375, + "step": 3129, + "time_per_iteration": 3.0739214420318604 + }, + { + "auxiliary_loss_clip": 0.01524643, + "auxiliary_loss_mlp": 0.01299811, + "balance_loss_clip": 1.16725719, + "balance_loss_mlp": 1.02686954, + "epoch": 0.3763602477003547, + "flos": 22566645930240.0, + "grad_norm": 1.9670058417691578, + "language_loss": 0.81168902, + "learning_rate": 2.8682687512208253e-06, + "loss": 0.83993357, + "num_input_tokens_seen": 67233185, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.73242188, + "step": 3130, + "time_per_iteration": 3.1825971603393555 + }, + { + "auxiliary_loss_clip": 0.01518803, + "auxiliary_loss_mlp": 0.01304644, + "balance_loss_clip": 1.16095781, + "balance_loss_mlp": 1.02769709, + "epoch": 0.3764804905909938, + "flos": 27529329716640.0, + "grad_norm": 2.1841912439141247, + "language_loss": 0.80838692, + "learning_rate": 2.8675669502329972e-06, + "loss": 0.8366214, + "num_input_tokens_seen": 67254715, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 2.7734375, + "step": 3131, + "time_per_iteration": 2.9532651901245117 + }, + { + "auxiliary_loss_clip": 0.01522271, + "auxiliary_loss_mlp": 0.01302325, + "balance_loss_clip": 1.1633451, + "balance_loss_mlp": 1.03052759, + "epoch": 0.3766007334816329, + "flos": 22530272460480.0, + "grad_norm": 2.740716218116822, + "language_loss": 0.85856682, + "learning_rate": 2.866865017635952e-06, + "loss": 0.88681281, + "num_input_tokens_seen": 67272535, + "router_z_loss_clip": 3.59375, + "router_z_loss_mlp": 2.72070312, + "step": 3132, + "time_per_iteration": 3.0175650119781494 + }, + { + "auxiliary_loss_clip": 0.01521313, + "auxiliary_loss_mlp": 0.01302795, + "balance_loss_clip": 1.16335487, + "balance_loss_mlp": 1.02794576, + "epoch": 0.376720976372272, + "flos": 25959505436640.0, + "grad_norm": 2.0034828086457916, + "language_loss": 0.79577339, + "learning_rate": 2.866162953536174e-06, + "loss": 0.82401448, + "num_input_tokens_seen": 67293505, + "router_z_loss_clip": 3.58203125, + "router_z_loss_mlp": 2.75195312, + "step": 3133, + "time_per_iteration": 2.9350197315216064 + }, + { + "auxiliary_loss_clip": 0.01529222, + "auxiliary_loss_mlp": 0.01297746, + "balance_loss_clip": 1.17203951, + "balance_loss_mlp": 1.0253768, + "epoch": 0.3768412192629111, + "flos": 18043136680320.0, + "grad_norm": 1.8933251033899041, + "language_loss": 0.74804777, + "learning_rate": 2.8654607580401634e-06, + "loss": 0.77631742, + "num_input_tokens_seen": 67313240, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.7265625, + "step": 3134, + "time_per_iteration": 2.932957649230957 + }, + { + "auxiliary_loss_clip": 0.01604884, + "auxiliary_loss_mlp": 0.01207527, + "balance_loss_clip": 1.24987006, + "balance_loss_mlp": 0.98246002, + "epoch": 0.3769614621535502, + "flos": 62995888677600.0, + "grad_norm": 0.9256905804074514, + "language_loss": 0.65195608, + "learning_rate": 2.8647584312544446e-06, + "loss": 0.68008018, + "num_input_tokens_seen": 67378445, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.2578125, + "step": 3135, + "time_per_iteration": 3.4324023723602295 + }, + { + "auxiliary_loss_clip": 0.01524639, + "auxiliary_loss_mlp": 0.01302853, + "balance_loss_clip": 1.16749895, + "balance_loss_mlp": 1.02838564, + "epoch": 0.37708170504418925, + "flos": 23664070241280.0, + "grad_norm": 3.7620741848312145, + "language_loss": 0.85472143, + "learning_rate": 2.864055973285559e-06, + "loss": 0.88299632, + "num_input_tokens_seen": 67400445, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.74804688, + "step": 3136, + "time_per_iteration": 2.9680562019348145 + }, + { + "auxiliary_loss_clip": 0.01532677, + "auxiliary_loss_mlp": 0.01303701, + "balance_loss_clip": 1.17595553, + "balance_loss_mlp": 1.03247571, + "epoch": 0.37720194793482836, + "flos": 24425333991360.0, + "grad_norm": 1.8216339629605376, + "language_loss": 0.86373746, + "learning_rate": 2.8633533842400698e-06, + "loss": 0.89210129, + "num_input_tokens_seen": 67420645, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.71484375, + "step": 3137, + "time_per_iteration": 2.9925336837768555 + }, + { + "auxiliary_loss_clip": 0.0153168, + "auxiliary_loss_mlp": 0.01310459, + "balance_loss_clip": 1.17379987, + "balance_loss_mlp": 1.03351212, + "epoch": 0.3773221908254674, + "flos": 20998642201920.0, + "grad_norm": 2.3111304533242882, + "language_loss": 0.77409995, + "learning_rate": 2.862650664224558e-06, + "loss": 0.80252141, + "num_input_tokens_seen": 67439495, + "router_z_loss_clip": 3.58203125, + "router_z_loss_mlp": 2.7734375, + "step": 3138, + "time_per_iteration": 3.0320777893066406 + }, + { + "auxiliary_loss_clip": 0.01530804, + "auxiliary_loss_mlp": 0.01315449, + "balance_loss_clip": 1.17409444, + "balance_loss_mlp": 1.04040956, + "epoch": 0.37744243371610653, + "flos": 37634668230240.0, + "grad_norm": 2.2872461697360533, + "language_loss": 0.69717944, + "learning_rate": 2.861947813345627e-06, + "loss": 0.72564197, + "num_input_tokens_seen": 67462195, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.75390625, + "step": 3139, + "time_per_iteration": 3.0780487060546875 + }, + { + "auxiliary_loss_clip": 0.01530666, + "auxiliary_loss_mlp": 0.01309172, + "balance_loss_clip": 1.17381918, + "balance_loss_mlp": 1.03184402, + "epoch": 0.37756267660674564, + "flos": 26142927840000.0, + "grad_norm": 2.7426614638603195, + "language_loss": 0.72386205, + "learning_rate": 2.8612448317098974e-06, + "loss": 0.75226045, + "num_input_tokens_seen": 67482530, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.77734375, + "step": 3140, + "time_per_iteration": 3.0498814582824707 + }, + { + "auxiliary_loss_clip": 0.01526287, + "auxiliary_loss_mlp": 0.01318105, + "balance_loss_clip": 1.16820526, + "balance_loss_mlp": 1.04096746, + "epoch": 0.3776829194973847, + "flos": 19429955766720.0, + "grad_norm": 3.8488631058132974, + "language_loss": 0.82966936, + "learning_rate": 2.8605417194240114e-06, + "loss": 0.85811329, + "num_input_tokens_seen": 67500890, + "router_z_loss_clip": 3.58398438, + "router_z_loss_mlp": 2.77539062, + "step": 3141, + "time_per_iteration": 2.976283311843872 + }, + { + "auxiliary_loss_clip": 0.01522639, + "auxiliary_loss_mlp": 0.01312043, + "balance_loss_clip": 1.16512632, + "balance_loss_mlp": 1.03795671, + "epoch": 0.3778031623880238, + "flos": 17384128342560.0, + "grad_norm": 1.9803755273945087, + "language_loss": 0.7925669, + "learning_rate": 2.8598384765946315e-06, + "loss": 0.82091367, + "num_input_tokens_seen": 67519545, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 2.74414062, + "step": 3142, + "time_per_iteration": 2.999067783355713 + }, + { + "auxiliary_loss_clip": 0.01518364, + "auxiliary_loss_mlp": 0.0130949, + "balance_loss_clip": 1.16089511, + "balance_loss_mlp": 1.03616714, + "epoch": 0.3779234052786629, + "flos": 27128311273440.0, + "grad_norm": 2.086811556122968, + "language_loss": 0.71568346, + "learning_rate": 2.8591351033284377e-06, + "loss": 0.74396205, + "num_input_tokens_seen": 67539275, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 2.73632812, + "step": 3143, + "time_per_iteration": 2.961230993270874 + }, + { + "auxiliary_loss_clip": 0.01524046, + "auxiliary_loss_mlp": 0.01310227, + "balance_loss_clip": 1.16723621, + "balance_loss_mlp": 1.03709507, + "epoch": 0.37804364816930197, + "flos": 19684683839520.0, + "grad_norm": 2.2859794909457114, + "language_loss": 0.83373892, + "learning_rate": 2.8584315997321325e-06, + "loss": 0.86208165, + "num_input_tokens_seen": 67558280, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 2.734375, + "step": 3144, + "time_per_iteration": 3.8285863399505615 + }, + { + "auxiliary_loss_clip": 0.01520085, + "auxiliary_loss_mlp": 0.01309192, + "balance_loss_clip": 1.16356421, + "balance_loss_mlp": 1.03357971, + "epoch": 0.3781638910599411, + "flos": 22704667961760.0, + "grad_norm": 2.889299862294329, + "language_loss": 0.77984929, + "learning_rate": 2.8577279659124356e-06, + "loss": 0.80814207, + "num_input_tokens_seen": 67575955, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.75976562, + "step": 3145, + "time_per_iteration": 3.8414840698242188 + }, + { + "auxiliary_loss_clip": 0.01524137, + "auxiliary_loss_mlp": 0.0129846, + "balance_loss_clip": 1.16776431, + "balance_loss_mlp": 1.02513647, + "epoch": 0.3782841339505802, + "flos": 14649366898080.0, + "grad_norm": 1.9815976023239048, + "language_loss": 0.83332217, + "learning_rate": 2.857024201976089e-06, + "loss": 0.86154819, + "num_input_tokens_seen": 67593515, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.73632812, + "step": 3146, + "time_per_iteration": 2.927398204803467 + }, + { + "auxiliary_loss_clip": 0.015237, + "auxiliary_loss_mlp": 0.01301901, + "balance_loss_clip": 1.16713405, + "balance_loss_mlp": 1.02533531, + "epoch": 0.37840437684121925, + "flos": 32821043569920.0, + "grad_norm": 2.2967430140103824, + "language_loss": 0.72896731, + "learning_rate": 2.8563203080298516e-06, + "loss": 0.75722331, + "num_input_tokens_seen": 67614290, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.76953125, + "step": 3147, + "time_per_iteration": 2.996710777282715 + }, + { + "auxiliary_loss_clip": 0.01522504, + "auxiliary_loss_mlp": 0.01302908, + "balance_loss_clip": 1.1664958, + "balance_loss_mlp": 1.02634215, + "epoch": 0.37852461973185836, + "flos": 18371180615040.0, + "grad_norm": 2.6131215623935553, + "language_loss": 0.89325094, + "learning_rate": 2.855616284180505e-06, + "loss": 0.92150509, + "num_input_tokens_seen": 67631340, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.76953125, + "step": 3148, + "time_per_iteration": 3.808647871017456 + }, + { + "auxiliary_loss_clip": 0.01648166, + "auxiliary_loss_mlp": 0.01303955, + "balance_loss_clip": 1.29703045, + "balance_loss_mlp": 1.07736206, + "epoch": 0.37864486262249747, + "flos": 59507676684000.0, + "grad_norm": 0.9062128217990868, + "language_loss": 0.66011262, + "learning_rate": 2.8549121305348477e-06, + "loss": 0.68963385, + "num_input_tokens_seen": 67691125, + "router_z_loss_clip": 3.515625, + "router_z_loss_mlp": 2.265625, + "step": 3149, + "time_per_iteration": 3.3990230560302734 + }, + { + "auxiliary_loss_clip": 0.0152389, + "auxiliary_loss_mlp": 0.01293067, + "balance_loss_clip": 1.16765714, + "balance_loss_mlp": 1.02146077, + "epoch": 0.3787651055131365, + "flos": 23365003785120.0, + "grad_norm": 2.9813674909784695, + "language_loss": 0.8361814, + "learning_rate": 2.8542078471997006e-06, + "loss": 0.86435091, + "num_input_tokens_seen": 67708740, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.71875, + "step": 3150, + "time_per_iteration": 3.753660202026367 + }, + { + "auxiliary_loss_clip": 0.01524787, + "auxiliary_loss_mlp": 0.01297627, + "balance_loss_clip": 1.16858649, + "balance_loss_mlp": 1.02525783, + "epoch": 0.37888534840377563, + "flos": 24603332667840.0, + "grad_norm": 2.0214904478392985, + "language_loss": 0.75878394, + "learning_rate": 2.8535034342819013e-06, + "loss": 0.78700805, + "num_input_tokens_seen": 67726150, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.7265625, + "step": 3151, + "time_per_iteration": 3.125098943710327 + }, + { + "auxiliary_loss_clip": 0.01523574, + "auxiliary_loss_mlp": 0.01294539, + "balance_loss_clip": 1.16727591, + "balance_loss_mlp": 1.02121544, + "epoch": 0.37900559129441475, + "flos": 23989117851360.0, + "grad_norm": 1.687059245330432, + "language_loss": 0.72608542, + "learning_rate": 2.85279889188831e-06, + "loss": 0.7542665, + "num_input_tokens_seen": 67746525, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.73632812, + "step": 3152, + "time_per_iteration": 3.0318124294281006 + }, + { + "auxiliary_loss_clip": 0.01527886, + "auxiliary_loss_mlp": 0.01302468, + "balance_loss_clip": 1.17202282, + "balance_loss_mlp": 1.02914548, + "epoch": 0.3791258341850538, + "flos": 24646988344320.0, + "grad_norm": 2.6468393510368635, + "language_loss": 0.81207108, + "learning_rate": 2.852094220125805e-06, + "loss": 0.84037459, + "num_input_tokens_seen": 67766035, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.73632812, + "step": 3153, + "time_per_iteration": 2.933655023574829 + }, + { + "auxiliary_loss_clip": 0.01526401, + "auxiliary_loss_mlp": 0.01309542, + "balance_loss_clip": 1.17015922, + "balance_loss_mlp": 1.03583801, + "epoch": 0.3792460770756929, + "flos": 17422777501920.0, + "grad_norm": 2.8278355790730707, + "language_loss": 0.71007621, + "learning_rate": 2.8513894191012846e-06, + "loss": 0.73843569, + "num_input_tokens_seen": 67785015, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.74023438, + "step": 3154, + "time_per_iteration": 2.9510552883148193 + }, + { + "auxiliary_loss_clip": 0.01529291, + "auxiliary_loss_mlp": 0.01295539, + "balance_loss_clip": 1.17288315, + "balance_loss_mlp": 1.02526736, + "epoch": 0.37936631996633197, + "flos": 24208837868160.0, + "grad_norm": 1.6465055677676175, + "language_loss": 0.79202348, + "learning_rate": 2.8506844889216664e-06, + "loss": 0.82027173, + "num_input_tokens_seen": 67804400, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.70507812, + "step": 3155, + "time_per_iteration": 2.913743257522583 + }, + { + "auxiliary_loss_clip": 0.01639858, + "auxiliary_loss_mlp": 0.01214439, + "balance_loss_clip": 1.29006422, + "balance_loss_mlp": 0.99394989, + "epoch": 0.3794865628569711, + "flos": 70304528332800.0, + "grad_norm": 0.8977287074586172, + "language_loss": 0.62906492, + "learning_rate": 2.849979429693887e-06, + "loss": 0.65760791, + "num_input_tokens_seen": 67865385, + "router_z_loss_clip": 3.5, + "router_z_loss_mlp": 2.2109375, + "step": 3156, + "time_per_iteration": 3.5119011402130127 + }, + { + "auxiliary_loss_clip": 0.01524504, + "auxiliary_loss_mlp": 0.01297402, + "balance_loss_clip": 1.16874146, + "balance_loss_mlp": 1.02617681, + "epoch": 0.3796068057476102, + "flos": 15781230342720.0, + "grad_norm": 2.3688947817010346, + "language_loss": 0.74405771, + "learning_rate": 2.8492742415249042e-06, + "loss": 0.77227676, + "num_input_tokens_seen": 67883030, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.71484375, + "step": 3157, + "time_per_iteration": 2.9698548316955566 + }, + { + "auxiliary_loss_clip": 0.0153232, + "auxiliary_loss_mlp": 0.0129238, + "balance_loss_clip": 1.1771419, + "balance_loss_mlp": 1.02077365, + "epoch": 0.37972704863824924, + "flos": 25194031660800.0, + "grad_norm": 1.9777265261837245, + "language_loss": 0.76418531, + "learning_rate": 2.848568924521694e-06, + "loss": 0.79243231, + "num_input_tokens_seen": 67903810, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.71875, + "step": 3158, + "time_per_iteration": 3.0228347778320312 + }, + { + "auxiliary_loss_clip": 0.01529264, + "auxiliary_loss_mlp": 0.01291117, + "balance_loss_clip": 1.17347777, + "balance_loss_mlp": 1.02027392, + "epoch": 0.37984729152888835, + "flos": 26212602598560.0, + "grad_norm": 2.6787642459568604, + "language_loss": 0.73631614, + "learning_rate": 2.8478634787912526e-06, + "loss": 0.76451993, + "num_input_tokens_seen": 67921865, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.7109375, + "step": 3159, + "time_per_iteration": 3.0135858058929443 + }, + { + "auxiliary_loss_clip": 0.01533415, + "auxiliary_loss_mlp": 0.01299646, + "balance_loss_clip": 1.17924976, + "balance_loss_mlp": 1.02823031, + "epoch": 0.37996753441952746, + "flos": 25631651142720.0, + "grad_norm": 2.9261642172778832, + "language_loss": 0.76549345, + "learning_rate": 2.847157904440596e-06, + "loss": 0.79382408, + "num_input_tokens_seen": 67941595, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.71679688, + "step": 3160, + "time_per_iteration": 3.0745348930358887 + }, + { + "auxiliary_loss_clip": 0.01528058, + "auxiliary_loss_mlp": 0.01294912, + "balance_loss_clip": 1.17151093, + "balance_loss_mlp": 1.02330589, + "epoch": 0.3800877773101665, + "flos": 20120368985280.0, + "grad_norm": 1.763967554001355, + "language_loss": 0.73973405, + "learning_rate": 2.846452201576759e-06, + "loss": 0.76796377, + "num_input_tokens_seen": 67960970, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.71875, + "step": 3161, + "time_per_iteration": 3.0241880416870117 + }, + { + "auxiliary_loss_clip": 0.01641799, + "auxiliary_loss_mlp": 0.01255264, + "balance_loss_clip": 1.29112244, + "balance_loss_mlp": 1.02790833, + "epoch": 0.38020802020080563, + "flos": 63059532858720.0, + "grad_norm": 1.193454533988134, + "language_loss": 0.62763572, + "learning_rate": 2.845746370306795e-06, + "loss": 0.65660632, + "num_input_tokens_seen": 68026160, + "router_z_loss_clip": 3.5, + "router_z_loss_mlp": 2.2734375, + "step": 3162, + "time_per_iteration": 3.6405532360076904 + }, + { + "auxiliary_loss_clip": 0.01532084, + "auxiliary_loss_mlp": 0.01297574, + "balance_loss_clip": 1.17706692, + "balance_loss_mlp": 1.02329707, + "epoch": 0.38032826309144474, + "flos": 21290578164000.0, + "grad_norm": 2.2542617767523594, + "language_loss": 0.78370678, + "learning_rate": 2.84504041073778e-06, + "loss": 0.81200337, + "num_input_tokens_seen": 68044575, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 2.74609375, + "step": 3163, + "time_per_iteration": 3.035921096801758 + }, + { + "auxiliary_loss_clip": 0.01534977, + "auxiliary_loss_mlp": 0.01314953, + "balance_loss_clip": 1.18035853, + "balance_loss_mlp": 1.04239285, + "epoch": 0.3804485059820838, + "flos": 18956190384000.0, + "grad_norm": 1.973933491653778, + "language_loss": 0.79424047, + "learning_rate": 2.844334322976806e-06, + "loss": 0.82273972, + "num_input_tokens_seen": 68064790, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 2.72851562, + "step": 3164, + "time_per_iteration": 3.056521415710449 + }, + { + "auxiliary_loss_clip": 0.01528659, + "auxiliary_loss_mlp": 0.01322743, + "balance_loss_clip": 1.17374599, + "balance_loss_mlp": 1.04579592, + "epoch": 0.3805687488727229, + "flos": 21835838856960.0, + "grad_norm": 1.9983924964975879, + "language_loss": 0.83556557, + "learning_rate": 2.8436281071309866e-06, + "loss": 0.86407959, + "num_input_tokens_seen": 68083330, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.7734375, + "step": 3165, + "time_per_iteration": 3.0182478427886963 + }, + { + "auxiliary_loss_clip": 0.01634579, + "auxiliary_loss_mlp": 0.01309456, + "balance_loss_clip": 1.28417039, + "balance_loss_mlp": 1.0798111, + "epoch": 0.380688991763362, + "flos": 58552825783680.0, + "grad_norm": 0.7752777135094272, + "language_loss": 0.52958971, + "learning_rate": 2.842921763307455e-06, + "loss": 0.55903006, + "num_input_tokens_seen": 68146140, + "router_z_loss_clip": 3.5, + "router_z_loss_mlp": 2.2890625, + "step": 3166, + "time_per_iteration": 3.3888185024261475 + }, + { + "auxiliary_loss_clip": 0.01530333, + "auxiliary_loss_mlp": 0.01297484, + "balance_loss_clip": 1.17558575, + "balance_loss_mlp": 1.02416122, + "epoch": 0.38080923465400107, + "flos": 23801447494080.0, + "grad_norm": 2.4006702332867684, + "language_loss": 0.82346004, + "learning_rate": 2.842215291613361e-06, + "loss": 0.85173821, + "num_input_tokens_seen": 68164520, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.73632812, + "step": 3167, + "time_per_iteration": 3.002018690109253 + }, + { + "auxiliary_loss_clip": 0.01631141, + "auxiliary_loss_mlp": 0.01241386, + "balance_loss_clip": 1.28010464, + "balance_loss_mlp": 1.01784515, + "epoch": 0.3809294775446402, + "flos": 54976240448640.0, + "grad_norm": 0.8449565229737059, + "language_loss": 0.59153897, + "learning_rate": 2.8415086921558774e-06, + "loss": 0.62026423, + "num_input_tokens_seen": 68227945, + "router_z_loss_clip": 3.5078125, + "router_z_loss_mlp": 2.2421875, + "step": 3168, + "time_per_iteration": 3.391340732574463 + }, + { + "auxiliary_loss_clip": 0.01525953, + "auxiliary_loss_mlp": 0.01295836, + "balance_loss_clip": 1.17068815, + "balance_loss_mlp": 1.02537394, + "epoch": 0.38104972043527924, + "flos": 24645964284000.0, + "grad_norm": 1.6890352108616857, + "language_loss": 0.79138875, + "learning_rate": 2.840801965042194e-06, + "loss": 0.8196066, + "num_input_tokens_seen": 68247405, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.70703125, + "step": 3169, + "time_per_iteration": 2.9291632175445557 + }, + { + "auxiliary_loss_clip": 0.01537967, + "auxiliary_loss_mlp": 0.01298063, + "balance_loss_clip": 1.18403757, + "balance_loss_mlp": 1.02588463, + "epoch": 0.38116996332591835, + "flos": 22858999102080.0, + "grad_norm": 2.0456330312732045, + "language_loss": 0.84408796, + "learning_rate": 2.840095110379521e-06, + "loss": 0.87244827, + "num_input_tokens_seen": 68266925, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 2.72460938, + "step": 3170, + "time_per_iteration": 2.961839199066162 + }, + { + "auxiliary_loss_clip": 0.01620179, + "auxiliary_loss_mlp": 0.01215042, + "balance_loss_clip": 1.26924825, + "balance_loss_mlp": 0.99455261, + "epoch": 0.38129020621655746, + "flos": 60842420042400.0, + "grad_norm": 0.7468881960714026, + "language_loss": 0.53832698, + "learning_rate": 2.8393881282750884e-06, + "loss": 0.56667912, + "num_input_tokens_seen": 68329755, + "router_z_loss_clip": 3.515625, + "router_z_loss_mlp": 2.2109375, + "step": 3171, + "time_per_iteration": 4.122281312942505 + }, + { + "auxiliary_loss_clip": 0.01528277, + "auxiliary_loss_mlp": 0.01300973, + "balance_loss_clip": 1.17366695, + "balance_loss_mlp": 1.02860379, + "epoch": 0.3814104491071965, + "flos": 21649989051360.0, + "grad_norm": 2.005139037534678, + "language_loss": 0.7910465, + "learning_rate": 2.838681018836144e-06, + "loss": 0.81933898, + "num_input_tokens_seen": 68347075, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 2.7265625, + "step": 3172, + "time_per_iteration": 3.7846691608428955 + }, + { + "auxiliary_loss_clip": 0.01532641, + "auxiliary_loss_mlp": 0.01301328, + "balance_loss_clip": 1.17683756, + "balance_loss_mlp": 1.03029323, + "epoch": 0.3815306919978356, + "flos": 19101115340640.0, + "grad_norm": 2.4136840088430698, + "language_loss": 0.78406137, + "learning_rate": 2.837973782169955e-06, + "loss": 0.81240106, + "num_input_tokens_seen": 68365450, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.71289062, + "step": 3173, + "time_per_iteration": 2.9651474952697754 + }, + { + "auxiliary_loss_clip": 0.0161163, + "auxiliary_loss_mlp": 0.01219406, + "balance_loss_clip": 1.25946236, + "balance_loss_mlp": 0.99891663, + "epoch": 0.38165093488847474, + "flos": 67074116957280.0, + "grad_norm": 0.8191005881590949, + "language_loss": 0.59097314, + "learning_rate": 2.8372664183838096e-06, + "loss": 0.6192835, + "num_input_tokens_seen": 68428470, + "router_z_loss_clip": 3.5234375, + "router_z_loss_mlp": 2.2109375, + "step": 3174, + "time_per_iteration": 3.4547834396362305 + }, + { + "auxiliary_loss_clip": 0.01525977, + "auxiliary_loss_mlp": 0.01299634, + "balance_loss_clip": 1.16893339, + "balance_loss_mlp": 1.02840924, + "epoch": 0.3817711777791138, + "flos": 22343019312960.0, + "grad_norm": 2.3536456972010753, + "language_loss": 0.6850391, + "learning_rate": 2.836558927585015e-06, + "loss": 0.71329522, + "num_input_tokens_seen": 68445440, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.71484375, + "step": 3175, + "time_per_iteration": 3.79323410987854 + }, + { + "auxiliary_loss_clip": 0.01518584, + "auxiliary_loss_mlp": 0.01306881, + "balance_loss_clip": 1.16276813, + "balance_loss_mlp": 1.03355789, + "epoch": 0.3818914206697529, + "flos": 22822853201280.0, + "grad_norm": 2.302473036736703, + "language_loss": 0.82324368, + "learning_rate": 2.8358513098808957e-06, + "loss": 0.85149837, + "num_input_tokens_seen": 68465755, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.73632812, + "step": 3176, + "time_per_iteration": 3.079014301300049 + }, + { + "auxiliary_loss_clip": 0.01523648, + "auxiliary_loss_mlp": 0.01292893, + "balance_loss_clip": 1.16597033, + "balance_loss_mlp": 1.01880717, + "epoch": 0.382011663560392, + "flos": 24388201958400.0, + "grad_norm": 2.0187882233052084, + "language_loss": 0.77074099, + "learning_rate": 2.835143565378798e-06, + "loss": 0.79890645, + "num_input_tokens_seen": 68486220, + "router_z_loss_clip": 3.58007812, + "router_z_loss_mlp": 2.74414062, + "step": 3177, + "time_per_iteration": 3.040952444076538 + }, + { + "auxiliary_loss_clip": 0.01524569, + "auxiliary_loss_mlp": 0.0130005, + "balance_loss_clip": 1.1675508, + "balance_loss_mlp": 1.0236752, + "epoch": 0.38213190645103107, + "flos": 21983760138240.0, + "grad_norm": 2.585657627657572, + "language_loss": 0.7899394, + "learning_rate": 2.8344356941860847e-06, + "loss": 0.81818557, + "num_input_tokens_seen": 68505850, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.76757812, + "step": 3178, + "time_per_iteration": 3.903578519821167 + }, + { + "auxiliary_loss_clip": 0.01529102, + "auxiliary_loss_mlp": 0.01299903, + "balance_loss_clip": 1.17308128, + "balance_loss_mlp": 1.02848709, + "epoch": 0.3822521493416702, + "flos": 35519848754400.0, + "grad_norm": 2.4208342204779707, + "language_loss": 0.66142768, + "learning_rate": 2.8337276964101403e-06, + "loss": 0.68971771, + "num_input_tokens_seen": 68526290, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.71679688, + "step": 3179, + "time_per_iteration": 3.1816442012786865 + }, + { + "auxiliary_loss_clip": 0.01519369, + "auxiliary_loss_mlp": 0.01301137, + "balance_loss_clip": 1.16276217, + "balance_loss_mlp": 1.02743268, + "epoch": 0.3823723922323093, + "flos": 21072071848320.0, + "grad_norm": 2.320678530787887, + "language_loss": 0.76454943, + "learning_rate": 2.833019572158367e-06, + "loss": 0.79275453, + "num_input_tokens_seen": 68544725, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.74023438, + "step": 3180, + "time_per_iteration": 3.147864580154419 + }, + { + "auxiliary_loss_clip": 0.01527042, + "auxiliary_loss_mlp": 0.01293629, + "balance_loss_clip": 1.17036271, + "balance_loss_mlp": 1.0206871, + "epoch": 0.38249263512294834, + "flos": 19791035493120.0, + "grad_norm": 2.0941039280777742, + "language_loss": 0.80424595, + "learning_rate": 2.8323113215381872e-06, + "loss": 0.8324526, + "num_input_tokens_seen": 68563070, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.73242188, + "step": 3181, + "time_per_iteration": 3.015306234359741 + }, + { + "auxiliary_loss_clip": 0.01526245, + "auxiliary_loss_mlp": 0.01326067, + "balance_loss_clip": 1.16972065, + "balance_loss_mlp": 1.04683113, + "epoch": 0.38261287801358745, + "flos": 21436072043040.0, + "grad_norm": 2.610299019695728, + "language_loss": 0.76226568, + "learning_rate": 2.831602944657042e-06, + "loss": 0.79078883, + "num_input_tokens_seen": 68581150, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.796875, + "step": 3182, + "time_per_iteration": 3.0671675205230713 + }, + { + "auxiliary_loss_clip": 0.01528825, + "auxiliary_loss_mlp": 0.01301023, + "balance_loss_clip": 1.17209268, + "balance_loss_mlp": 1.02827263, + "epoch": 0.38273312090422656, + "flos": 21983760138240.0, + "grad_norm": 3.1242382493728402, + "language_loss": 0.743976, + "learning_rate": 2.830894441622391e-06, + "loss": 0.77227449, + "num_input_tokens_seen": 68597800, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.73046875, + "step": 3183, + "time_per_iteration": 2.9644668102264404 + }, + { + "auxiliary_loss_clip": 0.01532295, + "auxiliary_loss_mlp": 0.01299355, + "balance_loss_clip": 1.17577481, + "balance_loss_mlp": 1.02679467, + "epoch": 0.3828533637948656, + "flos": 24793240786560.0, + "grad_norm": 8.267763814215442, + "language_loss": 0.80095458, + "learning_rate": 2.8301858125417134e-06, + "loss": 0.82927108, + "num_input_tokens_seen": 68617640, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.72851562, + "step": 3184, + "time_per_iteration": 2.9861767292022705 + }, + { + "auxiliary_loss_clip": 0.01528066, + "auxiliary_loss_mlp": 0.01306653, + "balance_loss_clip": 1.172822, + "balance_loss_mlp": 1.03409314, + "epoch": 0.38297360668550473, + "flos": 22457639305440.0, + "grad_norm": 2.3568720857003504, + "language_loss": 0.74043846, + "learning_rate": 2.8294770575225082e-06, + "loss": 0.76878572, + "num_input_tokens_seen": 68637770, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.72851562, + "step": 3185, + "time_per_iteration": 3.0682218074798584 + }, + { + "auxiliary_loss_clip": 0.01536811, + "auxiliary_loss_mlp": 0.01300324, + "balance_loss_clip": 1.18168175, + "balance_loss_mlp": 1.02833617, + "epoch": 0.3830938495761438, + "flos": 24899061445920.0, + "grad_norm": 1.7927590738690415, + "language_loss": 0.83964664, + "learning_rate": 2.828768176672293e-06, + "loss": 0.86801803, + "num_input_tokens_seen": 68656885, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.72265625, + "step": 3186, + "time_per_iteration": 2.9611706733703613 + }, + { + "auxiliary_loss_clip": 0.01529361, + "auxiliary_loss_mlp": 0.01308868, + "balance_loss_clip": 1.17383671, + "balance_loss_mlp": 1.0336374, + "epoch": 0.3832140924667829, + "flos": 33039018891360.0, + "grad_norm": 1.7069482539607141, + "language_loss": 0.71724921, + "learning_rate": 2.8280591700986044e-06, + "loss": 0.74563152, + "num_input_tokens_seen": 68678750, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.75585938, + "step": 3187, + "time_per_iteration": 3.059464931488037 + }, + { + "auxiliary_loss_clip": 0.01529051, + "auxiliary_loss_mlp": 0.0130462, + "balance_loss_clip": 1.17317295, + "balance_loss_mlp": 1.03110623, + "epoch": 0.383334335357422, + "flos": 31906358955360.0, + "grad_norm": 2.055020647669032, + "language_loss": 0.7510308, + "learning_rate": 2.827350037908999e-06, + "loss": 0.77936745, + "num_input_tokens_seen": 68698190, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.73828125, + "step": 3188, + "time_per_iteration": 3.1555933952331543 + }, + { + "auxiliary_loss_clip": 0.01534269, + "auxiliary_loss_mlp": 0.01310747, + "balance_loss_clip": 1.17809105, + "balance_loss_mlp": 1.03494453, + "epoch": 0.38345457824806106, + "flos": 19793955961440.0, + "grad_norm": 2.33482002044412, + "language_loss": 0.79376721, + "learning_rate": 2.8266407802110496e-06, + "loss": 0.82221735, + "num_input_tokens_seen": 68716445, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.76171875, + "step": 3189, + "time_per_iteration": 3.0885262489318848 + }, + { + "auxiliary_loss_clip": 0.01532092, + "auxiliary_loss_mlp": 0.01297994, + "balance_loss_clip": 1.17645442, + "balance_loss_mlp": 1.02371716, + "epoch": 0.3835748211387002, + "flos": 22421607189120.0, + "grad_norm": 2.1878621182648272, + "language_loss": 0.75667715, + "learning_rate": 2.8259313971123515e-06, + "loss": 0.78497803, + "num_input_tokens_seen": 68737565, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.74609375, + "step": 3190, + "time_per_iteration": 3.280099391937256 + }, + { + "auxiliary_loss_clip": 0.01531259, + "auxiliary_loss_mlp": 0.0128809, + "balance_loss_clip": 1.17422485, + "balance_loss_mlp": 1.01762855, + "epoch": 0.3836950640293393, + "flos": 25120753727040.0, + "grad_norm": 1.9072519710111182, + "language_loss": 0.78456283, + "learning_rate": 2.8252218887205166e-06, + "loss": 0.8127563, + "num_input_tokens_seen": 68758255, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.70703125, + "step": 3191, + "time_per_iteration": 3.005237579345703 + }, + { + "auxiliary_loss_clip": 0.0153391, + "auxiliary_loss_mlp": 0.01299577, + "balance_loss_clip": 1.17850399, + "balance_loss_mlp": 1.02739871, + "epoch": 0.38381530691997834, + "flos": 21801399723360.0, + "grad_norm": 1.7353874349416492, + "language_loss": 0.80610335, + "learning_rate": 2.824512255143178e-06, + "loss": 0.8344382, + "num_input_tokens_seen": 68777490, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.72460938, + "step": 3192, + "time_per_iteration": 3.023345708847046 + }, + { + "auxiliary_loss_clip": 0.0153554, + "auxiliary_loss_mlp": 0.0129463, + "balance_loss_clip": 1.17951703, + "balance_loss_mlp": 1.02130735, + "epoch": 0.38393554981061745, + "flos": 21254887401120.0, + "grad_norm": 1.9479113662304972, + "language_loss": 0.79087669, + "learning_rate": 2.8238024964879855e-06, + "loss": 0.81917834, + "num_input_tokens_seen": 68798385, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.73632812, + "step": 3193, + "time_per_iteration": 3.0540883541107178 + }, + { + "auxiliary_loss_clip": 0.01541069, + "auxiliary_loss_mlp": 0.01302622, + "balance_loss_clip": 1.18606937, + "balance_loss_mlp": 1.02872658, + "epoch": 0.38405579270125656, + "flos": 17021265992640.0, + "grad_norm": 5.61141940047379, + "language_loss": 0.76678294, + "learning_rate": 2.8230926128626095e-06, + "loss": 0.7952199, + "num_input_tokens_seen": 68816880, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.7421875, + "step": 3194, + "time_per_iteration": 3.034133195877075 + }, + { + "auxiliary_loss_clip": 0.0153541, + "auxiliary_loss_mlp": 0.01311817, + "balance_loss_clip": 1.17831397, + "balance_loss_mlp": 1.03868437, + "epoch": 0.3841760355918956, + "flos": 21837545624160.0, + "grad_norm": 2.093663461482812, + "language_loss": 0.7925027, + "learning_rate": 2.822382604374738e-06, + "loss": 0.82097495, + "num_input_tokens_seen": 68835805, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.734375, + "step": 3195, + "time_per_iteration": 3.084451913833618 + }, + { + "auxiliary_loss_clip": 0.01546071, + "auxiliary_loss_mlp": 0.01296713, + "balance_loss_clip": 1.19168186, + "balance_loss_mlp": 1.02682304, + "epoch": 0.3842962784825347, + "flos": 25917139317600.0, + "grad_norm": 10.744583036644585, + "language_loss": 0.65670019, + "learning_rate": 2.8216724711320793e-06, + "loss": 0.68512797, + "num_input_tokens_seen": 68854930, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.70117188, + "step": 3196, + "time_per_iteration": 3.028043746948242 + }, + { + "auxiliary_loss_clip": 0.01541836, + "auxiliary_loss_mlp": 0.01291816, + "balance_loss_clip": 1.18584347, + "balance_loss_mlp": 1.02211654, + "epoch": 0.38441652137317384, + "flos": 25339828965120.0, + "grad_norm": 1.732068608357238, + "language_loss": 0.79752773, + "learning_rate": 2.820962213242361e-06, + "loss": 0.82586426, + "num_input_tokens_seen": 68874260, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.69921875, + "step": 3197, + "time_per_iteration": 2.9706227779388428 + }, + { + "auxiliary_loss_clip": 0.01549703, + "auxiliary_loss_mlp": 0.0129722, + "balance_loss_clip": 1.19579458, + "balance_loss_mlp": 1.02694893, + "epoch": 0.3845367642638129, + "flos": 18115428481920.0, + "grad_norm": 3.0099603753012563, + "language_loss": 0.84314609, + "learning_rate": 2.8202518308133264e-06, + "loss": 0.87161529, + "num_input_tokens_seen": 68891535, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 2.70507812, + "step": 3198, + "time_per_iteration": 2.961167097091675 + }, + { + "auxiliary_loss_clip": 0.01541214, + "auxiliary_loss_mlp": 0.01300382, + "balance_loss_clip": 1.18581629, + "balance_loss_mlp": 1.02572405, + "epoch": 0.384657007154452, + "flos": 25230746484000.0, + "grad_norm": 2.1833222975264115, + "language_loss": 0.73514879, + "learning_rate": 2.8195413239527426e-06, + "loss": 0.76356471, + "num_input_tokens_seen": 68911275, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.75, + "step": 3199, + "time_per_iteration": 3.845815420150757 + }, + { + "auxiliary_loss_clip": 0.01541456, + "auxiliary_loss_mlp": 0.01291013, + "balance_loss_clip": 1.1865921, + "balance_loss_mlp": 1.01921582, + "epoch": 0.38477725004509106, + "flos": 19867954530240.0, + "grad_norm": 2.2331750413983835, + "language_loss": 0.80622017, + "learning_rate": 2.8188306927683906e-06, + "loss": 0.8345449, + "num_input_tokens_seen": 68930745, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.72070312, + "step": 3200, + "time_per_iteration": 3.9316599369049072 + }, + { + "auxiliary_loss_clip": 0.01553644, + "auxiliary_loss_mlp": 0.01291836, + "balance_loss_clip": 1.20153546, + "balance_loss_mlp": 1.02042043, + "epoch": 0.38489749293573017, + "flos": 18261339570720.0, + "grad_norm": 3.539876985060959, + "language_loss": 0.74195623, + "learning_rate": 2.818119937368074e-06, + "loss": 0.77041101, + "num_input_tokens_seen": 68949380, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 2.71679688, + "step": 3201, + "time_per_iteration": 3.041470766067505 + }, + { + "auxiliary_loss_clip": 0.01552205, + "auxiliary_loss_mlp": 0.01300904, + "balance_loss_clip": 1.19916582, + "balance_loss_mlp": 1.02395725, + "epoch": 0.3850177358263693, + "flos": 24391463780160.0, + "grad_norm": 1.9626170803363217, + "language_loss": 0.65761137, + "learning_rate": 2.817409057859613e-06, + "loss": 0.68614244, + "num_input_tokens_seen": 68968370, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 2.7734375, + "step": 3202, + "time_per_iteration": 2.9744575023651123 + }, + { + "auxiliary_loss_clip": 0.01551598, + "auxiliary_loss_mlp": 0.01301592, + "balance_loss_clip": 1.19837499, + "balance_loss_mlp": 1.02807856, + "epoch": 0.38513797871700833, + "flos": 17673219692640.0, + "grad_norm": 2.620168376538761, + "language_loss": 0.79341739, + "learning_rate": 2.8166980543508482e-06, + "loss": 0.82194924, + "num_input_tokens_seen": 68984260, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 2.73828125, + "step": 3203, + "time_per_iteration": 3.7707414627075195 + }, + { + "auxiliary_loss_clip": 0.01555677, + "auxiliary_loss_mlp": 0.01303647, + "balance_loss_clip": 1.20298302, + "balance_loss_mlp": 1.02879858, + "epoch": 0.38525822160764744, + "flos": 25741985253120.0, + "grad_norm": 2.211855298537223, + "language_loss": 0.79983324, + "learning_rate": 2.815986926949638e-06, + "loss": 0.82842648, + "num_input_tokens_seen": 69002760, + "router_z_loss_clip": 3.5234375, + "router_z_loss_mlp": 2.75195312, + "step": 3204, + "time_per_iteration": 3.002774238586426 + }, + { + "auxiliary_loss_clip": 0.01553112, + "auxiliary_loss_mlp": 0.01291784, + "balance_loss_clip": 1.19983602, + "balance_loss_mlp": 1.01960492, + "epoch": 0.38537846449828655, + "flos": 20195732967840.0, + "grad_norm": 1.8137219357760106, + "language_loss": 0.80574608, + "learning_rate": 2.8152756757638597e-06, + "loss": 0.83419502, + "num_input_tokens_seen": 69021260, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 2.72460938, + "step": 3205, + "time_per_iteration": 2.97580885887146 + }, + { + "auxiliary_loss_clip": 0.01555829, + "auxiliary_loss_mlp": 0.01302962, + "balance_loss_clip": 1.20300698, + "balance_loss_mlp": 1.03059256, + "epoch": 0.3854987073889256, + "flos": 23041587085920.0, + "grad_norm": 2.0694683942127323, + "language_loss": 0.84858489, + "learning_rate": 2.8145643009014093e-06, + "loss": 0.87717283, + "num_input_tokens_seen": 69039755, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 2.7265625, + "step": 3206, + "time_per_iteration": 3.8462445735931396 + }, + { + "auxiliary_loss_clip": 0.0156009, + "auxiliary_loss_mlp": 0.01298752, + "balance_loss_clip": 1.20673287, + "balance_loss_mlp": 1.02848053, + "epoch": 0.3856189502795647, + "flos": 20192547002400.0, + "grad_norm": 1.9117452310418257, + "language_loss": 0.79548186, + "learning_rate": 2.813852802470202e-06, + "loss": 0.82407033, + "num_input_tokens_seen": 69057650, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 2.70507812, + "step": 3207, + "time_per_iteration": 3.0600037574768066 + }, + { + "auxiliary_loss_clip": 0.01555827, + "auxiliary_loss_mlp": 0.01300751, + "balance_loss_clip": 1.20361924, + "balance_loss_mlp": 1.02513933, + "epoch": 0.38573919317020383, + "flos": 25705004932800.0, + "grad_norm": 2.181688755756029, + "language_loss": 0.72442752, + "learning_rate": 2.8131411805781717e-06, + "loss": 0.75299329, + "num_input_tokens_seen": 69077775, + "router_z_loss_clip": 3.52148438, + "router_z_loss_mlp": 2.75976562, + "step": 3208, + "time_per_iteration": 2.95858097076416 + }, + { + "auxiliary_loss_clip": 0.01559735, + "auxiliary_loss_mlp": 0.01307413, + "balance_loss_clip": 1.20749724, + "balance_loss_mlp": 1.03065681, + "epoch": 0.3858594360608429, + "flos": 29823778779840.0, + "grad_norm": 2.5254605327760005, + "language_loss": 0.6463784, + "learning_rate": 2.8124294353332707e-06, + "loss": 0.67504984, + "num_input_tokens_seen": 69096450, + "router_z_loss_clip": 3.5234375, + "router_z_loss_mlp": 2.77148438, + "step": 3209, + "time_per_iteration": 3.019810199737549 + }, + { + "auxiliary_loss_clip": 0.01556305, + "auxiliary_loss_mlp": 0.01300475, + "balance_loss_clip": 1.20264852, + "balance_loss_mlp": 1.02753329, + "epoch": 0.385979678951482, + "flos": 24792747720480.0, + "grad_norm": 2.1249470692184573, + "language_loss": 0.77542239, + "learning_rate": 2.8117175668434713e-06, + "loss": 0.80399019, + "num_input_tokens_seen": 69116110, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 2.73242188, + "step": 3210, + "time_per_iteration": 3.004470109939575 + }, + { + "auxiliary_loss_clip": 0.01563738, + "auxiliary_loss_mlp": 0.0130984, + "balance_loss_clip": 1.21024263, + "balance_loss_mlp": 1.0365169, + "epoch": 0.3860999218421211, + "flos": 21290009241600.0, + "grad_norm": 3.4457108731977217, + "language_loss": 0.69840705, + "learning_rate": 2.811005575216762e-06, + "loss": 0.72714281, + "num_input_tokens_seen": 69134825, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 2.73632812, + "step": 3211, + "time_per_iteration": 2.931905746459961 + }, + { + "auxiliary_loss_clip": 0.01553113, + "auxiliary_loss_mlp": 0.01299893, + "balance_loss_clip": 1.20127106, + "balance_loss_mlp": 1.02504468, + "epoch": 0.38622016473276016, + "flos": 24539119564320.0, + "grad_norm": 1.4707023147525042, + "language_loss": 0.78789932, + "learning_rate": 2.8102934605611513e-06, + "loss": 0.81642938, + "num_input_tokens_seen": 69156460, + "router_z_loss_clip": 3.51757812, + "router_z_loss_mlp": 2.75195312, + "step": 3212, + "time_per_iteration": 3.071072816848755 + }, + { + "auxiliary_loss_clip": 0.01559415, + "auxiliary_loss_mlp": 0.01299396, + "balance_loss_clip": 1.20752287, + "balance_loss_mlp": 1.02855229, + "epoch": 0.3863404076233993, + "flos": 20560302084960.0, + "grad_norm": 2.5369302281087442, + "language_loss": 0.67516196, + "learning_rate": 2.8095812229846665e-06, + "loss": 0.70375013, + "num_input_tokens_seen": 69176420, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 2.7109375, + "step": 3213, + "time_per_iteration": 2.9153754711151123 + }, + { + "auxiliary_loss_clip": 0.0155027, + "auxiliary_loss_mlp": 0.01301095, + "balance_loss_clip": 1.19705868, + "balance_loss_mlp": 1.02719927, + "epoch": 0.3864606505140384, + "flos": 22348480968000.0, + "grad_norm": 2.432249860821671, + "language_loss": 0.69039309, + "learning_rate": 2.808868862595355e-06, + "loss": 0.7189067, + "num_input_tokens_seen": 69196665, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 2.7421875, + "step": 3214, + "time_per_iteration": 2.925403594970703 + }, + { + "auxiliary_loss_clip": 0.01557904, + "auxiliary_loss_mlp": 0.01292182, + "balance_loss_clip": 1.20652401, + "balance_loss_mlp": 1.02095699, + "epoch": 0.38658089340467744, + "flos": 25706028993120.0, + "grad_norm": 2.727173580069898, + "language_loss": 0.7982803, + "learning_rate": 2.8081563795012795e-06, + "loss": 0.82678115, + "num_input_tokens_seen": 69216290, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 2.71484375, + "step": 3215, + "time_per_iteration": 2.9642722606658936 + }, + { + "auxiliary_loss_clip": 0.01554953, + "auxiliary_loss_mlp": 0.01296268, + "balance_loss_clip": 1.20250678, + "balance_loss_mlp": 1.0240891, + "epoch": 0.38670113629531655, + "flos": 33805365014880.0, + "grad_norm": 2.005008207090594, + "language_loss": 0.73843539, + "learning_rate": 2.807443773810524e-06, + "loss": 0.76694763, + "num_input_tokens_seen": 69237550, + "router_z_loss_clip": 3.52539062, + "router_z_loss_mlp": 2.72460938, + "step": 3216, + "time_per_iteration": 3.003484010696411 + }, + { + "auxiliary_loss_clip": 0.01564998, + "auxiliary_loss_mlp": 0.01306347, + "balance_loss_clip": 1.21370697, + "balance_loss_mlp": 1.03416884, + "epoch": 0.3868213791859556, + "flos": 23333788545120.0, + "grad_norm": 3.1838798463887708, + "language_loss": 0.89397478, + "learning_rate": 2.80673104563119e-06, + "loss": 0.92268825, + "num_input_tokens_seen": 69258175, + "router_z_loss_clip": 3.51367188, + "router_z_loss_mlp": 2.72460938, + "step": 3217, + "time_per_iteration": 2.989598035812378 + }, + { + "auxiliary_loss_clip": 0.01567311, + "auxiliary_loss_mlp": 0.01292859, + "balance_loss_clip": 1.21627045, + "balance_loss_mlp": 1.02373207, + "epoch": 0.3869416220765947, + "flos": 18443472416640.0, + "grad_norm": 1.9693102340819617, + "language_loss": 0.79163766, + "learning_rate": 2.8060181950713976e-06, + "loss": 0.82023937, + "num_input_tokens_seen": 69274965, + "router_z_loss_clip": 3.50976562, + "router_z_loss_mlp": 2.69335938, + "step": 3218, + "time_per_iteration": 2.9914627075195312 + }, + { + "auxiliary_loss_clip": 0.01566527, + "auxiliary_loss_mlp": 0.01300942, + "balance_loss_clip": 1.21528673, + "balance_loss_mlp": 1.02685642, + "epoch": 0.3870618649672338, + "flos": 15634826187840.0, + "grad_norm": 3.7034078778380004, + "language_loss": 0.8095417, + "learning_rate": 2.805305222239286e-06, + "loss": 0.83821642, + "num_input_tokens_seen": 69292220, + "router_z_loss_clip": 3.51367188, + "router_z_loss_mlp": 2.74414062, + "step": 3219, + "time_per_iteration": 3.2001054286956787 + }, + { + "auxiliary_loss_clip": 0.01575791, + "auxiliary_loss_mlp": 0.01294113, + "balance_loss_clip": 1.22593093, + "balance_loss_mlp": 1.02212489, + "epoch": 0.3871821078578729, + "flos": 23516073103680.0, + "grad_norm": 2.0730799861879077, + "language_loss": 0.74119413, + "learning_rate": 2.8045921272430118e-06, + "loss": 0.76989317, + "num_input_tokens_seen": 69311900, + "router_z_loss_clip": 3.5, + "router_z_loss_mlp": 2.72265625, + "step": 3220, + "time_per_iteration": 3.1480116844177246 + }, + { + "auxiliary_loss_clip": 0.015677, + "auxiliary_loss_mlp": 0.01306077, + "balance_loss_clip": 1.21642292, + "balance_loss_mlp": 1.02970195, + "epoch": 0.387302350748512, + "flos": 17779798915200.0, + "grad_norm": 3.2316458632782745, + "language_loss": 0.76658851, + "learning_rate": 2.803878910190753e-06, + "loss": 0.79532623, + "num_input_tokens_seen": 69328820, + "router_z_loss_clip": 3.51367188, + "router_z_loss_mlp": 2.76757812, + "step": 3221, + "time_per_iteration": 3.117819309234619 + }, + { + "auxiliary_loss_clip": 0.01561111, + "auxiliary_loss_mlp": 0.01296048, + "balance_loss_clip": 1.21118665, + "balance_loss_mlp": 1.02367902, + "epoch": 0.3874225936391511, + "flos": 11504635964640.0, + "grad_norm": 2.5338636273834116, + "language_loss": 0.81822014, + "learning_rate": 2.8031655711907017e-06, + "loss": 0.84679174, + "num_input_tokens_seen": 69342525, + "router_z_loss_clip": 3.49804688, + "router_z_loss_mlp": 2.7265625, + "step": 3222, + "time_per_iteration": 3.1504058837890625 + }, + { + "auxiliary_loss_clip": 0.0156143, + "auxiliary_loss_mlp": 0.01301143, + "balance_loss_clip": 1.21105158, + "balance_loss_mlp": 1.02839208, + "epoch": 0.38754283652979016, + "flos": 21947348740320.0, + "grad_norm": 3.3028203333794597, + "language_loss": 0.80576992, + "learning_rate": 2.8024521103510723e-06, + "loss": 0.83439571, + "num_input_tokens_seen": 69359295, + "router_z_loss_clip": 3.50195312, + "router_z_loss_mlp": 2.73046875, + "step": 3223, + "time_per_iteration": 3.1117544174194336 + }, + { + "auxiliary_loss_clip": 0.01566878, + "auxiliary_loss_mlp": 0.01302601, + "balance_loss_clip": 1.21563983, + "balance_loss_mlp": 1.03099442, + "epoch": 0.38766307942042927, + "flos": 21177968364000.0, + "grad_norm": 3.1380016478205137, + "language_loss": 0.75683129, + "learning_rate": 2.8017385277800952e-06, + "loss": 0.7855261, + "num_input_tokens_seen": 69377650, + "router_z_loss_clip": 3.51367188, + "router_z_loss_mlp": 2.71875, + "step": 3224, + "time_per_iteration": 2.9972550868988037 + }, + { + "auxiliary_loss_clip": 0.01571722, + "auxiliary_loss_mlp": 0.01306455, + "balance_loss_clip": 1.22211921, + "balance_loss_mlp": 1.02988935, + "epoch": 0.3877833223110684, + "flos": 27419754169440.0, + "grad_norm": 2.628153453141439, + "language_loss": 0.75123942, + "learning_rate": 2.8010248235860213e-06, + "loss": 0.78002119, + "num_input_tokens_seen": 69397765, + "router_z_loss_clip": 3.49804688, + "router_z_loss_mlp": 2.76953125, + "step": 3225, + "time_per_iteration": 3.086151361465454 + }, + { + "auxiliary_loss_clip": 0.01824334, + "auxiliary_loss_mlp": 0.01261986, + "balance_loss_clip": 1.48608375, + "balance_loss_mlp": 1.04149628, + "epoch": 0.38790356520170743, + "flos": 64507340790720.0, + "grad_norm": 0.8473326326975825, + "language_loss": 0.62837154, + "learning_rate": 2.8003109978771192e-06, + "loss": 0.65923476, + "num_input_tokens_seen": 69458930, + "router_z_loss_clip": 3.375, + "router_z_loss_mlp": 2.2109375, + "step": 3226, + "time_per_iteration": 4.288877248764038 + }, + { + "auxiliary_loss_clip": 0.01564653, + "auxiliary_loss_mlp": 0.01298619, + "balance_loss_clip": 1.21373439, + "balance_loss_mlp": 1.02701283, + "epoch": 0.38802380809234654, + "flos": 22347570692160.0, + "grad_norm": 1.9029959261107339, + "language_loss": 0.78837252, + "learning_rate": 2.799597050761674e-06, + "loss": 0.81700528, + "num_input_tokens_seen": 69475135, + "router_z_loss_clip": 3.50585938, + "router_z_loss_mlp": 2.71875, + "step": 3227, + "time_per_iteration": 2.9805245399475098 + }, + { + "auxiliary_loss_clip": 0.01562858, + "auxiliary_loss_mlp": 0.01296232, + "balance_loss_clip": 1.21343887, + "balance_loss_mlp": 1.02367139, + "epoch": 0.38814405098298566, + "flos": 25263516778560.0, + "grad_norm": 2.056799294099982, + "language_loss": 0.79219621, + "learning_rate": 2.7988829823479924e-06, + "loss": 0.82078707, + "num_input_tokens_seen": 69493525, + "router_z_loss_clip": 3.49414062, + "router_z_loss_mlp": 2.72851562, + "step": 3228, + "time_per_iteration": 3.9249908924102783 + }, + { + "auxiliary_loss_clip": 0.01551914, + "auxiliary_loss_mlp": 0.01293825, + "balance_loss_clip": 1.20147228, + "balance_loss_mlp": 1.02069283, + "epoch": 0.3882642938736247, + "flos": 18843049589760.0, + "grad_norm": 1.8590302702442245, + "language_loss": 0.64411104, + "learning_rate": 2.7981687927443976e-06, + "loss": 0.67256844, + "num_input_tokens_seen": 69510325, + "router_z_loss_clip": 3.5, + "router_z_loss_mlp": 2.734375, + "step": 3229, + "time_per_iteration": 2.9576117992401123 + }, + { + "auxiliary_loss_clip": 0.01557591, + "auxiliary_loss_mlp": 0.01300493, + "balance_loss_clip": 1.2078433, + "balance_loss_mlp": 1.0292685, + "epoch": 0.3883845367642638, + "flos": 21654957640320.0, + "grad_norm": 2.4000631148693063, + "language_loss": 0.85713238, + "learning_rate": 2.797454482059231e-06, + "loss": 0.88571316, + "num_input_tokens_seen": 69530480, + "router_z_loss_clip": 3.49609375, + "router_z_loss_mlp": 2.71484375, + "step": 3230, + "time_per_iteration": 3.0258188247680664 + }, + { + "auxiliary_loss_clip": 0.01558296, + "auxiliary_loss_mlp": 0.01297425, + "balance_loss_clip": 1.20928466, + "balance_loss_mlp": 1.02810717, + "epoch": 0.3885047796549029, + "flos": 20559581449920.0, + "grad_norm": 2.6006772918904257, + "language_loss": 0.8451494, + "learning_rate": 2.7967400504008537e-06, + "loss": 0.87370664, + "num_input_tokens_seen": 69549780, + "router_z_loss_clip": 3.4921875, + "router_z_loss_mlp": 2.6953125, + "step": 3231, + "time_per_iteration": 2.975858449935913 + }, + { + "auxiliary_loss_clip": 0.01822124, + "auxiliary_loss_mlp": 0.012211, + "balance_loss_clip": 1.48385894, + "balance_loss_mlp": 1.00366211, + "epoch": 0.388625022545542, + "flos": 64331200958400.0, + "grad_norm": 0.794367147375512, + "language_loss": 0.57461816, + "learning_rate": 2.7960254978776456e-06, + "loss": 0.60505038, + "num_input_tokens_seen": 69611870, + "router_z_loss_clip": 3.375, + "router_z_loss_mlp": 2.1796875, + "step": 3232, + "time_per_iteration": 4.290895700454712 + }, + { + "auxiliary_loss_clip": 0.0156472, + "auxiliary_loss_mlp": 0.01301461, + "balance_loss_clip": 1.2136296, + "balance_loss_mlp": 1.02890134, + "epoch": 0.3887452654361811, + "flos": 18115845691680.0, + "grad_norm": 2.602978106257043, + "language_loss": 0.82232565, + "learning_rate": 2.7953108245980006e-06, + "loss": 0.85098743, + "num_input_tokens_seen": 69630385, + "router_z_loss_clip": 3.50976562, + "router_z_loss_mlp": 2.72851562, + "step": 3233, + "time_per_iteration": 3.803783655166626 + }, + { + "auxiliary_loss_clip": 0.01565926, + "auxiliary_loss_mlp": 0.01298021, + "balance_loss_clip": 1.2162118, + "balance_loss_mlp": 1.02698708, + "epoch": 0.38886550832682015, + "flos": 24977725178400.0, + "grad_norm": 2.041885630873221, + "language_loss": 0.73767161, + "learning_rate": 2.7945960306703365e-06, + "loss": 0.76631111, + "num_input_tokens_seen": 69653370, + "router_z_loss_clip": 3.5, + "router_z_loss_mlp": 2.71289062, + "step": 3234, + "time_per_iteration": 3.066056489944458 + }, + { + "auxiliary_loss_clip": 0.01549158, + "auxiliary_loss_mlp": 0.01305304, + "balance_loss_clip": 1.19880855, + "balance_loss_mlp": 1.03274393, + "epoch": 0.38898575121745926, + "flos": 27202006416960.0, + "grad_norm": 1.7921973822288069, + "language_loss": 0.65733457, + "learning_rate": 2.7938811162030865e-06, + "loss": 0.68587923, + "num_input_tokens_seen": 69673635, + "router_z_loss_clip": 3.50390625, + "router_z_loss_mlp": 2.72851562, + "step": 3235, + "time_per_iteration": 3.010012149810791 + }, + { + "auxiliary_loss_clip": 0.01555739, + "auxiliary_loss_mlp": 0.01295514, + "balance_loss_clip": 1.20572138, + "balance_loss_mlp": 1.02619672, + "epoch": 0.3891059941080984, + "flos": 28766065616640.0, + "grad_norm": 2.521941114124012, + "language_loss": 0.82527924, + "learning_rate": 2.793166081304702e-06, + "loss": 0.85379183, + "num_input_tokens_seen": 69694130, + "router_z_loss_clip": 3.50195312, + "router_z_loss_mlp": 2.6953125, + "step": 3236, + "time_per_iteration": 3.0738048553466797 + }, + { + "auxiliary_loss_clip": 0.0155078, + "auxiliary_loss_mlp": 0.01301715, + "balance_loss_clip": 1.19968104, + "balance_loss_mlp": 1.02877319, + "epoch": 0.38922623699873743, + "flos": 22895675997120.0, + "grad_norm": 2.413230210138136, + "language_loss": 0.82313704, + "learning_rate": 2.7924509260836543e-06, + "loss": 0.85166204, + "num_input_tokens_seen": 69713255, + "router_z_loss_clip": 3.51171875, + "router_z_loss_mlp": 2.73242188, + "step": 3237, + "time_per_iteration": 3.0839953422546387 + }, + { + "auxiliary_loss_clip": 0.01546432, + "auxiliary_loss_mlp": 0.01298421, + "balance_loss_clip": 1.19531214, + "balance_loss_mlp": 1.02719581, + "epoch": 0.38934647988937654, + "flos": 19794486955680.0, + "grad_norm": 1.5552276904491371, + "language_loss": 0.68454057, + "learning_rate": 2.791735650648431e-06, + "loss": 0.71298909, + "num_input_tokens_seen": 69732375, + "router_z_loss_clip": 3.50976562, + "router_z_loss_mlp": 2.71484375, + "step": 3238, + "time_per_iteration": 3.079160690307617 + }, + { + "auxiliary_loss_clip": 0.01541198, + "auxiliary_loss_mlp": 0.01292591, + "balance_loss_clip": 1.1893332, + "balance_loss_mlp": 1.02079356, + "epoch": 0.38946672278001565, + "flos": 19203750034560.0, + "grad_norm": 2.406778323964514, + "language_loss": 0.74857634, + "learning_rate": 2.791020255107538e-06, + "loss": 0.77691424, + "num_input_tokens_seen": 69749745, + "router_z_loss_clip": 3.515625, + "router_z_loss_mlp": 2.72070312, + "step": 3239, + "time_per_iteration": 3.052722454071045 + }, + { + "auxiliary_loss_clip": 0.01550982, + "auxiliary_loss_mlp": 0.01297714, + "balance_loss_clip": 1.20021892, + "balance_loss_mlp": 1.02458155, + "epoch": 0.3895869656706547, + "flos": 24938810521920.0, + "grad_norm": 1.741937973832126, + "language_loss": 0.80549878, + "learning_rate": 2.7903047395695023e-06, + "loss": 0.83398569, + "num_input_tokens_seen": 69769645, + "router_z_loss_clip": 3.5078125, + "router_z_loss_mlp": 2.734375, + "step": 3240, + "time_per_iteration": 3.0738627910614014 + }, + { + "auxiliary_loss_clip": 0.0155379, + "auxiliary_loss_mlp": 0.01292961, + "balance_loss_clip": 1.20225728, + "balance_loss_mlp": 1.01906586, + "epoch": 0.3897072085612938, + "flos": 24135901287840.0, + "grad_norm": 2.491866651829563, + "language_loss": 0.90608233, + "learning_rate": 2.789589104142865e-06, + "loss": 0.93454987, + "num_input_tokens_seen": 69787270, + "router_z_loss_clip": 3.51367188, + "router_z_loss_mlp": 2.7421875, + "step": 3241, + "time_per_iteration": 3.030667304992676 + }, + { + "auxiliary_loss_clip": 0.01544196, + "auxiliary_loss_mlp": 0.01303468, + "balance_loss_clip": 1.19474316, + "balance_loss_mlp": 1.03014457, + "epoch": 0.3898274514519329, + "flos": 17168314926240.0, + "grad_norm": 1.8635088083516322, + "language_loss": 0.76481408, + "learning_rate": 2.7888733489361895e-06, + "loss": 0.79329073, + "num_input_tokens_seen": 69805685, + "router_z_loss_clip": 3.49414062, + "router_z_loss_mlp": 2.73632812, + "step": 3242, + "time_per_iteration": 3.0920608043670654 + }, + { + "auxiliary_loss_clip": 0.01790795, + "auxiliary_loss_mlp": 0.01206261, + "balance_loss_clip": 1.44922388, + "balance_loss_mlp": 0.98958588, + "epoch": 0.389947694342572, + "flos": 66080503113120.0, + "grad_norm": 0.7389693329682636, + "language_loss": 0.5869537, + "learning_rate": 2.788157474058054e-06, + "loss": 0.61692423, + "num_input_tokens_seen": 69867960, + "router_z_loss_clip": 3.4140625, + "router_z_loss_mlp": 2.171875, + "step": 3243, + "time_per_iteration": 3.5522053241729736 + }, + { + "auxiliary_loss_clip": 0.01544175, + "auxiliary_loss_mlp": 0.01295873, + "balance_loss_clip": 1.19302511, + "balance_loss_mlp": 1.02693701, + "epoch": 0.3900679372332111, + "flos": 25743388595040.0, + "grad_norm": 2.1566044290920274, + "language_loss": 0.70073503, + "learning_rate": 2.7874414796170555e-06, + "loss": 0.72913551, + "num_input_tokens_seen": 69889450, + "router_z_loss_clip": 3.51171875, + "router_z_loss_mlp": 2.69140625, + "step": 3244, + "time_per_iteration": 2.947185754776001 + }, + { + "auxiliary_loss_clip": 0.01549449, + "auxiliary_loss_mlp": 0.01319645, + "balance_loss_clip": 1.19851983, + "balance_loss_mlp": 1.0465132, + "epoch": 0.3901881801238502, + "flos": 11803209354720.0, + "grad_norm": 2.6750941927501097, + "language_loss": 0.83562154, + "learning_rate": 2.7867253657218113e-06, + "loss": 0.86431253, + "num_input_tokens_seen": 69903340, + "router_z_loss_clip": 3.51367188, + "router_z_loss_mlp": 2.734375, + "step": 3245, + "time_per_iteration": 3.009140968322754 + }, + { + "auxiliary_loss_clip": 0.01540419, + "auxiliary_loss_mlp": 0.01310978, + "balance_loss_clip": 1.18799973, + "balance_loss_mlp": 1.03593791, + "epoch": 0.39030842301448926, + "flos": 27311468179680.0, + "grad_norm": 2.035911631045196, + "language_loss": 0.73000014, + "learning_rate": 2.7860091324809544e-06, + "loss": 0.75851405, + "num_input_tokens_seen": 69924400, + "router_z_loss_clip": 3.52734375, + "router_z_loss_mlp": 2.75390625, + "step": 3246, + "time_per_iteration": 2.982198715209961 + }, + { + "auxiliary_loss_clip": 0.01545099, + "auxiliary_loss_mlp": 0.01296648, + "balance_loss_clip": 1.19320858, + "balance_loss_mlp": 1.02637625, + "epoch": 0.39042866590512837, + "flos": 27165860516160.0, + "grad_norm": 2.151794369328936, + "language_loss": 0.80856788, + "learning_rate": 2.7852927800031377e-06, + "loss": 0.83698529, + "num_input_tokens_seen": 69944565, + "router_z_loss_clip": 3.5234375, + "router_z_loss_mlp": 2.70507812, + "step": 3247, + "time_per_iteration": 3.0395007133483887 + }, + { + "auxiliary_loss_clip": 0.01539472, + "auxiliary_loss_mlp": 0.013039, + "balance_loss_clip": 1.18724155, + "balance_loss_mlp": 1.03153074, + "epoch": 0.3905489087957674, + "flos": 29718716683680.0, + "grad_norm": 1.7556043745236343, + "language_loss": 0.82850969, + "learning_rate": 2.7845763083970298e-06, + "loss": 0.85694337, + "num_input_tokens_seen": 69964965, + "router_z_loss_clip": 3.5234375, + "router_z_loss_mlp": 2.7265625, + "step": 3248, + "time_per_iteration": 3.1649439334869385 + }, + { + "auxiliary_loss_clip": 0.01539629, + "auxiliary_loss_mlp": 0.01300405, + "balance_loss_clip": 1.18669343, + "balance_loss_mlp": 1.02536559, + "epoch": 0.39066915168640653, + "flos": 24500925542880.0, + "grad_norm": 3.0524726027582476, + "language_loss": 0.82130873, + "learning_rate": 2.7838597177713205e-06, + "loss": 0.84970903, + "num_input_tokens_seen": 69986055, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 2.75390625, + "step": 3249, + "time_per_iteration": 2.9809036254882812 + }, + { + "auxiliary_loss_clip": 0.0154059, + "auxiliary_loss_mlp": 0.01293919, + "balance_loss_clip": 1.18924904, + "balance_loss_mlp": 1.02174044, + "epoch": 0.39078939457704565, + "flos": 20560681366560.0, + "grad_norm": 1.9460260897854706, + "language_loss": 0.73645103, + "learning_rate": 2.7831430082347143e-06, + "loss": 0.76479614, + "num_input_tokens_seen": 70005260, + "router_z_loss_clip": 3.51367188, + "router_z_loss_mlp": 2.72460938, + "step": 3250, + "time_per_iteration": 3.083979368209839 + }, + { + "auxiliary_loss_clip": 0.0155413, + "auxiliary_loss_mlp": 0.01306484, + "balance_loss_clip": 1.20222104, + "balance_loss_mlp": 1.0371666, + "epoch": 0.3909096374676847, + "flos": 22785607383840.0, + "grad_norm": 2.0753176074706667, + "language_loss": 0.82146966, + "learning_rate": 2.7824261798959373e-06, + "loss": 0.85007584, + "num_input_tokens_seen": 70023440, + "router_z_loss_clip": 3.52148438, + "router_z_loss_mlp": 2.6953125, + "step": 3251, + "time_per_iteration": 2.9582183361053467 + }, + { + "auxiliary_loss_clip": 0.01546735, + "auxiliary_loss_mlp": 0.01290295, + "balance_loss_clip": 1.19456232, + "balance_loss_mlp": 1.0186882, + "epoch": 0.3910298803583238, + "flos": 23005365328800.0, + "grad_norm": 2.7666001815389807, + "language_loss": 0.79842567, + "learning_rate": 2.78170923286373e-06, + "loss": 0.82679594, + "num_input_tokens_seen": 70043040, + "router_z_loss_clip": 3.5234375, + "router_z_loss_mlp": 2.71875, + "step": 3252, + "time_per_iteration": 3.0476489067077637 + }, + { + "auxiliary_loss_clip": 0.01551084, + "auxiliary_loss_mlp": 0.01302619, + "balance_loss_clip": 1.19975245, + "balance_loss_mlp": 1.03044009, + "epoch": 0.3911501232489629, + "flos": 24318565128000.0, + "grad_norm": 2.820249383871645, + "language_loss": 0.84216762, + "learning_rate": 2.780992167246854e-06, + "loss": 0.87070465, + "num_input_tokens_seen": 70060565, + "router_z_loss_clip": 3.51367188, + "router_z_loss_mlp": 2.72460938, + "step": 3253, + "time_per_iteration": 3.0059618949890137 + }, + { + "auxiliary_loss_clip": 0.01785427, + "auxiliary_loss_mlp": 0.01240936, + "balance_loss_clip": 1.44171572, + "balance_loss_mlp": 1.02120972, + "epoch": 0.391270366139602, + "flos": 60875114480640.0, + "grad_norm": 0.9837488354921585, + "language_loss": 0.72119343, + "learning_rate": 2.7802749831540883e-06, + "loss": 0.75145698, + "num_input_tokens_seen": 70119465, + "router_z_loss_clip": 3.4375, + "router_z_loss_mlp": 2.203125, + "step": 3254, + "time_per_iteration": 4.370385646820068 + }, + { + "auxiliary_loss_clip": 0.01548655, + "auxiliary_loss_mlp": 0.01287453, + "balance_loss_clip": 1.19766533, + "balance_loss_mlp": 1.01775408, + "epoch": 0.3913906090302411, + "flos": 21545647590240.0, + "grad_norm": 2.484847807430956, + "language_loss": 0.82634306, + "learning_rate": 2.7795576806942268e-06, + "loss": 0.85470408, + "num_input_tokens_seen": 70138270, + "router_z_loss_clip": 3.5078125, + "router_z_loss_mlp": 2.69921875, + "step": 3255, + "time_per_iteration": 3.914132833480835 + }, + { + "auxiliary_loss_clip": 0.01783761, + "auxiliary_loss_mlp": 0.01227241, + "balance_loss_clip": 1.44108105, + "balance_loss_mlp": 1.00904083, + "epoch": 0.3915108519208802, + "flos": 49844812821120.0, + "grad_norm": 0.7636412676681913, + "language_loss": 0.54844701, + "learning_rate": 2.778840259976085e-06, + "loss": 0.57855701, + "num_input_tokens_seen": 70193500, + "router_z_loss_clip": 3.4296875, + "router_z_loss_mlp": 2.1875, + "step": 3256, + "time_per_iteration": 3.383857488632202 + }, + { + "auxiliary_loss_clip": 0.01544974, + "auxiliary_loss_mlp": 0.01322134, + "balance_loss_clip": 1.19244146, + "balance_loss_mlp": 1.044806, + "epoch": 0.39163109481151925, + "flos": 16508206671840.0, + "grad_norm": 2.432974589678723, + "language_loss": 0.76824373, + "learning_rate": 2.778122721108495e-06, + "loss": 0.79691482, + "num_input_tokens_seen": 70211730, + "router_z_loss_clip": 3.52539062, + "router_z_loss_mlp": 2.77734375, + "step": 3257, + "time_per_iteration": 3.0789921283721924 + }, + { + "auxiliary_loss_clip": 0.01558602, + "auxiliary_loss_mlp": 0.01300615, + "balance_loss_clip": 1.20668435, + "balance_loss_mlp": 1.02958035, + "epoch": 0.39175133770215836, + "flos": 26069877475200.0, + "grad_norm": 7.383696271032083, + "language_loss": 0.88348478, + "learning_rate": 2.7774050642003076e-06, + "loss": 0.91207695, + "num_input_tokens_seen": 70232540, + "router_z_loss_clip": 3.515625, + "router_z_loss_mlp": 2.71289062, + "step": 3258, + "time_per_iteration": 3.0684397220611572 + }, + { + "auxiliary_loss_clip": 0.01547227, + "auxiliary_loss_mlp": 0.0130062, + "balance_loss_clip": 1.1955682, + "balance_loss_mlp": 1.02557993, + "epoch": 0.3918715805927975, + "flos": 21874260447360.0, + "grad_norm": 3.554446372559534, + "language_loss": 0.93359178, + "learning_rate": 2.7766872893603896e-06, + "loss": 0.96207017, + "num_input_tokens_seen": 70252515, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 2.75390625, + "step": 3259, + "time_per_iteration": 3.8588879108428955 + }, + { + "auxiliary_loss_clip": 0.01550462, + "auxiliary_loss_mlp": 0.01300744, + "balance_loss_clip": 1.19840479, + "balance_loss_mlp": 1.03237963, + "epoch": 0.39199182348343653, + "flos": 20378283023520.0, + "grad_norm": 1.8305009682012936, + "language_loss": 0.73240018, + "learning_rate": 2.7759693966976275e-06, + "loss": 0.7609123, + "num_input_tokens_seen": 70271020, + "router_z_loss_clip": 3.52148438, + "router_z_loss_mlp": 2.68554688, + "step": 3260, + "time_per_iteration": 3.9550082683563232 + }, + { + "auxiliary_loss_clip": 0.01545688, + "auxiliary_loss_mlp": 0.01303424, + "balance_loss_clip": 1.19403458, + "balance_loss_mlp": 1.02933848, + "epoch": 0.39211206637407564, + "flos": 21685376388960.0, + "grad_norm": 9.680978142810773, + "language_loss": 0.85075581, + "learning_rate": 2.7752513863209242e-06, + "loss": 0.87924695, + "num_input_tokens_seen": 70289600, + "router_z_loss_clip": 3.51367188, + "router_z_loss_mlp": 2.74414062, + "step": 3261, + "time_per_iteration": 3.005427837371826 + }, + { + "auxiliary_loss_clip": 0.01553606, + "auxiliary_loss_mlp": 0.01297572, + "balance_loss_clip": 1.202317, + "balance_loss_mlp": 1.02806413, + "epoch": 0.39223230926471475, + "flos": 21068241104160.0, + "grad_norm": 2.7850229093258436, + "language_loss": 0.84371245, + "learning_rate": 2.774533258339203e-06, + "loss": 0.87222421, + "num_input_tokens_seen": 70307060, + "router_z_loss_clip": 3.51367188, + "router_z_loss_mlp": 2.69726562, + "step": 3262, + "time_per_iteration": 2.9755659103393555 + }, + { + "auxiliary_loss_clip": 0.01544668, + "auxiliary_loss_mlp": 0.01299457, + "balance_loss_clip": 1.193331, + "balance_loss_mlp": 1.02670634, + "epoch": 0.3923525521553538, + "flos": 17604948276000.0, + "grad_norm": 2.719647899904433, + "language_loss": 0.79636598, + "learning_rate": 2.7738150128614014e-06, + "loss": 0.82480723, + "num_input_tokens_seen": 70324465, + "router_z_loss_clip": 3.51171875, + "router_z_loss_mlp": 2.73046875, + "step": 3263, + "time_per_iteration": 2.9608864784240723 + }, + { + "auxiliary_loss_clip": 0.01545886, + "auxiliary_loss_mlp": 0.01304821, + "balance_loss_clip": 1.19353676, + "balance_loss_mlp": 1.03493083, + "epoch": 0.3924727950459929, + "flos": 20560377941280.0, + "grad_norm": 1.8761565684665424, + "language_loss": 0.89776528, + "learning_rate": 2.7730966499964777e-06, + "loss": 0.92627239, + "num_input_tokens_seen": 70341415, + "router_z_loss_clip": 3.5234375, + "router_z_loss_mlp": 2.70117188, + "step": 3264, + "time_per_iteration": 2.8959360122680664 + }, + { + "auxiliary_loss_clip": 0.01542051, + "auxiliary_loss_mlp": 0.01296699, + "balance_loss_clip": 1.19018626, + "balance_loss_mlp": 1.02471089, + "epoch": 0.39259303793663197, + "flos": 16217939548800.0, + "grad_norm": 3.2076660642602457, + "language_loss": 0.81095421, + "learning_rate": 2.772378169853408e-06, + "loss": 0.83934176, + "num_input_tokens_seen": 70358985, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 2.72265625, + "step": 3265, + "time_per_iteration": 2.969569683074951 + }, + { + "auxiliary_loss_clip": 0.01551169, + "auxiliary_loss_mlp": 0.01293558, + "balance_loss_clip": 1.20029259, + "balance_loss_mlp": 1.02347755, + "epoch": 0.3927132808272711, + "flos": 16798701363840.0, + "grad_norm": 2.449552921340061, + "language_loss": 0.74702764, + "learning_rate": 2.771659572541183e-06, + "loss": 0.77547491, + "num_input_tokens_seen": 70376915, + "router_z_loss_clip": 3.5078125, + "router_z_loss_mlp": 2.703125, + "step": 3266, + "time_per_iteration": 2.9010801315307617 + }, + { + "auxiliary_loss_clip": 0.01554238, + "auxiliary_loss_mlp": 0.013054, + "balance_loss_clip": 1.20468616, + "balance_loss_mlp": 1.03493774, + "epoch": 0.3928335237179102, + "flos": 20269352255040.0, + "grad_norm": 2.2074810218284107, + "language_loss": 0.87019122, + "learning_rate": 2.7709408581688143e-06, + "loss": 0.89878762, + "num_input_tokens_seen": 70396900, + "router_z_loss_clip": 3.49414062, + "router_z_loss_mlp": 2.70703125, + "step": 3267, + "time_per_iteration": 3.087674856185913 + }, + { + "auxiliary_loss_clip": 0.01552508, + "auxiliary_loss_mlp": 0.01295239, + "balance_loss_clip": 1.20217705, + "balance_loss_mlp": 1.02515864, + "epoch": 0.39295376660854925, + "flos": 24975601201440.0, + "grad_norm": 1.6122518981154796, + "language_loss": 0.87923682, + "learning_rate": 2.7702220268453307e-06, + "loss": 0.90771425, + "num_input_tokens_seen": 70417260, + "router_z_loss_clip": 3.50585938, + "router_z_loss_mlp": 2.703125, + "step": 3268, + "time_per_iteration": 2.9810452461242676 + }, + { + "auxiliary_loss_clip": 0.01546709, + "auxiliary_loss_mlp": 0.01301609, + "balance_loss_clip": 1.19593751, + "balance_loss_mlp": 1.02752268, + "epoch": 0.39307400949918836, + "flos": 18699793472160.0, + "grad_norm": 2.1671798628334824, + "language_loss": 0.8431561, + "learning_rate": 2.7695030786797785e-06, + "loss": 0.87163931, + "num_input_tokens_seen": 70433155, + "router_z_loss_clip": 3.51171875, + "router_z_loss_mlp": 2.74414062, + "step": 3269, + "time_per_iteration": 2.9188339710235596 + }, + { + "auxiliary_loss_clip": 0.01545659, + "auxiliary_loss_mlp": 0.01292091, + "balance_loss_clip": 1.19485474, + "balance_loss_mlp": 1.02372742, + "epoch": 0.39319425238982747, + "flos": 22417473019680.0, + "grad_norm": 2.0228124199061748, + "language_loss": 0.74437106, + "learning_rate": 2.7687840137812206e-06, + "loss": 0.77274853, + "num_input_tokens_seen": 70451240, + "router_z_loss_clip": 3.50585938, + "router_z_loss_mlp": 2.68554688, + "step": 3270, + "time_per_iteration": 2.9826908111572266 + }, + { + "auxiliary_loss_clip": 0.01779012, + "auxiliary_loss_mlp": 0.01212708, + "balance_loss_clip": 1.4385004, + "balance_loss_mlp": 0.99755859, + "epoch": 0.3933144952804665, + "flos": 66199143490560.0, + "grad_norm": 0.8012421629768335, + "language_loss": 0.62049186, + "learning_rate": 2.7680648322587395e-06, + "loss": 0.65040898, + "num_input_tokens_seen": 70516115, + "router_z_loss_clip": 3.40625, + "router_z_loss_mlp": 2.15625, + "step": 3271, + "time_per_iteration": 3.5452380180358887 + }, + { + "auxiliary_loss_clip": 0.01549734, + "auxiliary_loss_mlp": 0.01302666, + "balance_loss_clip": 1.19845915, + "balance_loss_mlp": 1.03086853, + "epoch": 0.39343473817110564, + "flos": 15488990955360.0, + "grad_norm": 2.420636324613391, + "language_loss": 0.80749202, + "learning_rate": 2.7673455342214334e-06, + "loss": 0.83601606, + "num_input_tokens_seen": 70533105, + "router_z_loss_clip": 3.515625, + "router_z_loss_mlp": 2.72070312, + "step": 3272, + "time_per_iteration": 3.096423864364624 + }, + { + "auxiliary_loss_clip": 0.01540814, + "auxiliary_loss_mlp": 0.01300788, + "balance_loss_clip": 1.18890691, + "balance_loss_mlp": 1.02994466, + "epoch": 0.39355498106174475, + "flos": 21327065418240.0, + "grad_norm": 8.511335672017738, + "language_loss": 0.76045674, + "learning_rate": 2.7666261197784198e-06, + "loss": 0.78887278, + "num_input_tokens_seen": 70551920, + "router_z_loss_clip": 3.52148438, + "router_z_loss_mlp": 2.7109375, + "step": 3273, + "time_per_iteration": 3.0250329971313477 + }, + { + "auxiliary_loss_clip": 0.01535963, + "auxiliary_loss_mlp": 0.01293565, + "balance_loss_clip": 1.18503773, + "balance_loss_mlp": 1.02367473, + "epoch": 0.3936752239523838, + "flos": 13297973077440.0, + "grad_norm": 2.076369251260922, + "language_loss": 0.76230454, + "learning_rate": 2.7659065890388336e-06, + "loss": 0.79059982, + "num_input_tokens_seen": 70567920, + "router_z_loss_clip": 3.50976562, + "router_z_loss_mlp": 2.70117188, + "step": 3274, + "time_per_iteration": 2.986685276031494 + }, + { + "auxiliary_loss_clip": 0.01539842, + "auxiliary_loss_mlp": 0.01298985, + "balance_loss_clip": 1.18826795, + "balance_loss_mlp": 1.02890468, + "epoch": 0.3937954668430229, + "flos": 16802039041920.0, + "grad_norm": 2.288319856573703, + "language_loss": 0.85119826, + "learning_rate": 2.7651869421118266e-06, + "loss": 0.87958652, + "num_input_tokens_seen": 70584530, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 2.703125, + "step": 3275, + "time_per_iteration": 3.048922300338745 + }, + { + "auxiliary_loss_clip": 0.01550469, + "auxiliary_loss_mlp": 0.01302398, + "balance_loss_clip": 1.19847989, + "balance_loss_mlp": 1.0338428, + "epoch": 0.393915709733662, + "flos": 21066079199040.0, + "grad_norm": 2.4965274637455046, + "language_loss": 0.83285749, + "learning_rate": 2.76446717910657e-06, + "loss": 0.86138612, + "num_input_tokens_seen": 70605235, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 2.6875, + "step": 3276, + "time_per_iteration": 3.0008814334869385 + }, + { + "auxiliary_loss_clip": 0.01539828, + "auxiliary_loss_mlp": 0.01292023, + "balance_loss_clip": 1.1882062, + "balance_loss_mlp": 1.02022552, + "epoch": 0.3940359526243011, + "flos": 17167177081440.0, + "grad_norm": 3.8453679661787503, + "language_loss": 0.76650059, + "learning_rate": 2.763747300132249e-06, + "loss": 0.79481906, + "num_input_tokens_seen": 70622675, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 2.72070312, + "step": 3277, + "time_per_iteration": 3.189793825149536 + }, + { + "auxiliary_loss_clip": 0.01537957, + "auxiliary_loss_mlp": 0.01300039, + "balance_loss_clip": 1.18560624, + "balance_loss_mlp": 1.0295769, + "epoch": 0.3941561955149402, + "flos": 20997959495040.0, + "grad_norm": 2.193619593091703, + "language_loss": 0.86965382, + "learning_rate": 2.7630273052980704e-06, + "loss": 0.8980338, + "num_input_tokens_seen": 70643265, + "router_z_loss_clip": 3.52539062, + "router_z_loss_mlp": 2.70703125, + "step": 3278, + "time_per_iteration": 3.019124984741211 + }, + { + "auxiliary_loss_clip": 0.0154199, + "auxiliary_loss_mlp": 0.01296271, + "balance_loss_clip": 1.18941832, + "balance_loss_mlp": 1.02886057, + "epoch": 0.39427643840557924, + "flos": 18845401135680.0, + "grad_norm": 2.6443595075988693, + "language_loss": 0.66995525, + "learning_rate": 2.7623071947132554e-06, + "loss": 0.69833779, + "num_input_tokens_seen": 70660295, + "router_z_loss_clip": 3.52539062, + "router_z_loss_mlp": 2.67578125, + "step": 3279, + "time_per_iteration": 3.004490852355957 + }, + { + "auxiliary_loss_clip": 0.01542982, + "auxiliary_loss_mlp": 0.01294524, + "balance_loss_clip": 1.19130898, + "balance_loss_mlp": 1.02482498, + "epoch": 0.39439668129621835, + "flos": 23260927821120.0, + "grad_norm": 2.5501084682616035, + "language_loss": 0.79064763, + "learning_rate": 2.7615869684870458e-06, + "loss": 0.81902266, + "num_input_tokens_seen": 70679605, + "router_z_loss_clip": 3.51757812, + "router_z_loss_mlp": 2.69921875, + "step": 3280, + "time_per_iteration": 2.94917368888855 + }, + { + "auxiliary_loss_clip": 0.01544283, + "auxiliary_loss_mlp": 0.01304877, + "balance_loss_clip": 1.19158435, + "balance_loss_mlp": 1.03308034, + "epoch": 0.39451692418685746, + "flos": 26654773459680.0, + "grad_norm": 2.3313836829007077, + "language_loss": 0.85045636, + "learning_rate": 2.7608666267286986e-06, + "loss": 0.87894797, + "num_input_tokens_seen": 70699835, + "router_z_loss_clip": 3.52539062, + "router_z_loss_mlp": 2.72070312, + "step": 3281, + "time_per_iteration": 3.8301467895507812 + }, + { + "auxiliary_loss_clip": 0.01534387, + "auxiliary_loss_mlp": 0.01300818, + "balance_loss_clip": 1.18223214, + "balance_loss_mlp": 1.02806711, + "epoch": 0.3946371670774965, + "flos": 18260618935680.0, + "grad_norm": 12.905300079330544, + "language_loss": 0.86826503, + "learning_rate": 2.760146169547489e-06, + "loss": 0.89661705, + "num_input_tokens_seen": 70716600, + "router_z_loss_clip": 3.52734375, + "router_z_loss_mlp": 2.73046875, + "step": 3282, + "time_per_iteration": 2.9054975509643555 + }, + { + "auxiliary_loss_clip": 0.01544509, + "auxiliary_loss_mlp": 0.01303992, + "balance_loss_clip": 1.19228053, + "balance_loss_mlp": 1.03314841, + "epoch": 0.39475740996813563, + "flos": 24208382730240.0, + "grad_norm": 2.6662928365733425, + "language_loss": 0.76495123, + "learning_rate": 2.75942559705271e-06, + "loss": 0.79343629, + "num_input_tokens_seen": 70736335, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 2.7109375, + "step": 3283, + "time_per_iteration": 3.935056209564209 + }, + { + "auxiliary_loss_clip": 0.01534, + "auxiliary_loss_mlp": 0.01304116, + "balance_loss_clip": 1.18164933, + "balance_loss_mlp": 1.03136563, + "epoch": 0.39487765285877474, + "flos": 19319887153440.0, + "grad_norm": 2.4387309073601617, + "language_loss": 0.89193875, + "learning_rate": 2.7587049093536713e-06, + "loss": 0.92031991, + "num_input_tokens_seen": 70752665, + "router_z_loss_clip": 3.52539062, + "router_z_loss_mlp": 2.73046875, + "step": 3284, + "time_per_iteration": 2.9778151512145996 + }, + { + "auxiliary_loss_clip": 0.01537651, + "auxiliary_loss_mlp": 0.01304416, + "balance_loss_clip": 1.18645513, + "balance_loss_mlp": 1.03357244, + "epoch": 0.3949978957494138, + "flos": 17313695020800.0, + "grad_norm": 1.9805170671508605, + "language_loss": 0.80694413, + "learning_rate": 2.757984106559701e-06, + "loss": 0.8353647, + "num_input_tokens_seen": 70771650, + "router_z_loss_clip": 3.50976562, + "router_z_loss_mlp": 2.7109375, + "step": 3285, + "time_per_iteration": 2.886073112487793 + }, + { + "auxiliary_loss_clip": 0.01542673, + "auxiliary_loss_mlp": 0.0130162, + "balance_loss_clip": 1.19046712, + "balance_loss_mlp": 1.03115773, + "epoch": 0.3951181386400529, + "flos": 36320709867840.0, + "grad_norm": 2.356903789078566, + "language_loss": 0.71424353, + "learning_rate": 2.7572631887801446e-06, + "loss": 0.74268645, + "num_input_tokens_seen": 70793275, + "router_z_loss_clip": 3.5234375, + "router_z_loss_mlp": 2.70703125, + "step": 3286, + "time_per_iteration": 3.916778087615967 + }, + { + "auxiliary_loss_clip": 0.01539588, + "auxiliary_loss_mlp": 0.01307748, + "balance_loss_clip": 1.19031882, + "balance_loss_mlp": 1.03385234, + "epoch": 0.395238381530692, + "flos": 23112475545600.0, + "grad_norm": 2.0427168588518945, + "language_loss": 0.76825863, + "learning_rate": 2.7565421561243654e-06, + "loss": 0.79673201, + "num_input_tokens_seen": 70811440, + "router_z_loss_clip": 3.5, + "router_z_loss_mlp": 2.7421875, + "step": 3287, + "time_per_iteration": 2.938384532928467 + }, + { + "auxiliary_loss_clip": 0.01535515, + "auxiliary_loss_mlp": 0.0129244, + "balance_loss_clip": 1.18548644, + "balance_loss_mlp": 1.0242672, + "epoch": 0.3953586244213311, + "flos": 24349628655360.0, + "grad_norm": 4.220952088893747, + "language_loss": 0.82132733, + "learning_rate": 2.7558210087017413e-06, + "loss": 0.84960687, + "num_input_tokens_seen": 70831375, + "router_z_loss_clip": 3.49609375, + "router_z_loss_mlp": 2.68359375, + "step": 3288, + "time_per_iteration": 3.851571798324585 + }, + { + "auxiliary_loss_clip": 0.01544531, + "auxiliary_loss_mlp": 0.01310816, + "balance_loss_clip": 1.19575667, + "balance_loss_mlp": 1.0401634, + "epoch": 0.3954788673119702, + "flos": 23442302103840.0, + "grad_norm": 1.9502423208108908, + "language_loss": 0.73605943, + "learning_rate": 2.7550997466216724e-06, + "loss": 0.76461279, + "num_input_tokens_seen": 70849170, + "router_z_loss_clip": 3.48828125, + "router_z_loss_mlp": 2.70898438, + "step": 3289, + "time_per_iteration": 3.017744541168213 + }, + { + "auxiliary_loss_clip": 0.01539996, + "auxiliary_loss_mlp": 0.01301418, + "balance_loss_clip": 1.18909013, + "balance_loss_mlp": 1.02962089, + "epoch": 0.3955991102026093, + "flos": 17496017507520.0, + "grad_norm": 2.1339126291016615, + "language_loss": 0.8138544, + "learning_rate": 2.7543783699935714e-06, + "loss": 0.84226847, + "num_input_tokens_seen": 70867200, + "router_z_loss_clip": 3.51171875, + "router_z_loss_mlp": 2.72070312, + "step": 3290, + "time_per_iteration": 2.9694015979766846 + }, + { + "auxiliary_loss_clip": 0.01552625, + "auxiliary_loss_mlp": 0.01289337, + "balance_loss_clip": 1.20381892, + "balance_loss_mlp": 1.02040064, + "epoch": 0.39571935309324835, + "flos": 18223183477440.0, + "grad_norm": 3.376579766015456, + "language_loss": 0.86152571, + "learning_rate": 2.753656878926872e-06, + "loss": 0.88994533, + "num_input_tokens_seen": 70883080, + "router_z_loss_clip": 3.48828125, + "router_z_loss_mlp": 2.69140625, + "step": 3291, + "time_per_iteration": 2.949211835861206 + }, + { + "auxiliary_loss_clip": 0.01532739, + "auxiliary_loss_mlp": 0.01297119, + "balance_loss_clip": 1.18326128, + "balance_loss_mlp": 1.02379584, + "epoch": 0.39583959598388746, + "flos": 17750745580320.0, + "grad_norm": 3.0345462655558975, + "language_loss": 0.74458015, + "learning_rate": 2.752935273531023e-06, + "loss": 0.77287871, + "num_input_tokens_seen": 70901230, + "router_z_loss_clip": 3.49414062, + "router_z_loss_mlp": 2.73632812, + "step": 3292, + "time_per_iteration": 3.0028793811798096 + }, + { + "auxiliary_loss_clip": 0.01534779, + "auxiliary_loss_mlp": 0.01300934, + "balance_loss_clip": 1.18544841, + "balance_loss_mlp": 1.02627599, + "epoch": 0.39595983887452657, + "flos": 19354288358880.0, + "grad_norm": 2.453822964748415, + "language_loss": 0.78312773, + "learning_rate": 2.752213553915492e-06, + "loss": 0.81148481, + "num_input_tokens_seen": 70919585, + "router_z_loss_clip": 3.49804688, + "router_z_loss_mlp": 2.75, + "step": 3293, + "time_per_iteration": 2.9424502849578857 + }, + { + "auxiliary_loss_clip": 0.01680052, + "auxiliary_loss_mlp": 0.01241188, + "balance_loss_clip": 1.33877742, + "balance_loss_mlp": 1.02146149, + "epoch": 0.3960800817651656, + "flos": 60688088902080.0, + "grad_norm": 0.8205836096063617, + "language_loss": 0.66020095, + "learning_rate": 2.751491720189762e-06, + "loss": 0.68941337, + "num_input_tokens_seen": 70977695, + "router_z_loss_clip": 3.40625, + "router_z_loss_mlp": 2.203125, + "step": 3294, + "time_per_iteration": 3.4595136642456055 + }, + { + "auxiliary_loss_clip": 0.01537513, + "auxiliary_loss_mlp": 0.01288106, + "balance_loss_clip": 1.18689084, + "balance_loss_mlp": 1.01840663, + "epoch": 0.39620032465580474, + "flos": 16838602152480.0, + "grad_norm": 2.4174529772807465, + "language_loss": 0.91450673, + "learning_rate": 2.7507697724633364e-06, + "loss": 0.94276285, + "num_input_tokens_seen": 70994455, + "router_z_loss_clip": 3.50585938, + "router_z_loss_mlp": 2.69921875, + "step": 3295, + "time_per_iteration": 3.0266900062561035 + }, + { + "auxiliary_loss_clip": 0.01665832, + "auxiliary_loss_mlp": 0.01213386, + "balance_loss_clip": 1.32446659, + "balance_loss_mlp": 0.99365997, + "epoch": 0.3963205675464438, + "flos": 69077578262400.0, + "grad_norm": 0.7757559516964, + "language_loss": 0.54633731, + "learning_rate": 2.7500477108457327e-06, + "loss": 0.57512951, + "num_input_tokens_seen": 71046465, + "router_z_loss_clip": 3.40625, + "router_z_loss_mlp": 2.203125, + "step": 3296, + "time_per_iteration": 3.325911521911621 + }, + { + "auxiliary_loss_clip": 0.01528356, + "auxiliary_loss_mlp": 0.01295138, + "balance_loss_clip": 1.17802, + "balance_loss_mlp": 1.02543902, + "epoch": 0.3964408104370829, + "flos": 25669883092320.0, + "grad_norm": 1.9328835411470633, + "language_loss": 0.80864602, + "learning_rate": 2.7493255354464877e-06, + "loss": 0.83688098, + "num_input_tokens_seen": 71064275, + "router_z_loss_clip": 3.50195312, + "router_z_loss_mlp": 2.69921875, + "step": 3297, + "time_per_iteration": 3.1158828735351562 + }, + { + "auxiliary_loss_clip": 0.01530691, + "auxiliary_loss_mlp": 0.01290681, + "balance_loss_clip": 1.17986035, + "balance_loss_mlp": 1.02346194, + "epoch": 0.396561053327722, + "flos": 24279650471520.0, + "grad_norm": 3.230417279684729, + "language_loss": 0.76325238, + "learning_rate": 2.748603246375156e-06, + "loss": 0.79146612, + "num_input_tokens_seen": 71082290, + "router_z_loss_clip": 3.50976562, + "router_z_loss_mlp": 2.67382812, + "step": 3298, + "time_per_iteration": 2.95499849319458 + }, + { + "auxiliary_loss_clip": 0.01528695, + "auxiliary_loss_mlp": 0.01302654, + "balance_loss_clip": 1.17888045, + "balance_loss_mlp": 1.03409886, + "epoch": 0.39668129621836107, + "flos": 20524421681280.0, + "grad_norm": 3.293336234485526, + "language_loss": 0.6994217, + "learning_rate": 2.7478808437413055e-06, + "loss": 0.72773516, + "num_input_tokens_seen": 71101700, + "router_z_loss_clip": 3.49414062, + "router_z_loss_mlp": 2.6875, + "step": 3299, + "time_per_iteration": 2.942974805831909 + }, + { + "auxiliary_loss_clip": 0.01525238, + "auxiliary_loss_mlp": 0.01297588, + "balance_loss_clip": 1.17553043, + "balance_loss_mlp": 1.02598119, + "epoch": 0.3968015391090002, + "flos": 27055033339680.0, + "grad_norm": 1.9987129728560629, + "language_loss": 0.66013098, + "learning_rate": 2.7471583276545263e-06, + "loss": 0.68835926, + "num_input_tokens_seen": 71122360, + "router_z_loss_clip": 3.49609375, + "router_z_loss_mlp": 2.71875, + "step": 3300, + "time_per_iteration": 2.986402750015259 + }, + { + "auxiliary_loss_clip": 0.01523498, + "auxiliary_loss_mlp": 0.01297107, + "balance_loss_clip": 1.172212, + "balance_loss_mlp": 1.027789, + "epoch": 0.3969217819996393, + "flos": 12533485433760.0, + "grad_norm": 2.1114324316196704, + "language_loss": 0.70353436, + "learning_rate": 2.7464356982244224e-06, + "loss": 0.73174047, + "num_input_tokens_seen": 71140360, + "router_z_loss_clip": 3.51171875, + "router_z_loss_mlp": 2.6953125, + "step": 3301, + "time_per_iteration": 2.8926584720611572 + }, + { + "auxiliary_loss_clip": 0.01628588, + "auxiliary_loss_mlp": 0.01202278, + "balance_loss_clip": 1.28750467, + "balance_loss_mlp": 0.98636627, + "epoch": 0.39704202489027834, + "flos": 66248260822080.0, + "grad_norm": 0.7862587315877977, + "language_loss": 0.61729032, + "learning_rate": 2.745712955560617e-06, + "loss": 0.64559895, + "num_input_tokens_seen": 71196565, + "router_z_loss_clip": 3.40625, + "router_z_loss_mlp": 2.1640625, + "step": 3302, + "time_per_iteration": 3.389876127243042 + }, + { + "auxiliary_loss_clip": 0.01530904, + "auxiliary_loss_mlp": 0.01294546, + "balance_loss_clip": 1.18081164, + "balance_loss_mlp": 1.02274895, + "epoch": 0.39716226778091746, + "flos": 16984892522880.0, + "grad_norm": 4.754252105644524, + "language_loss": 0.77344149, + "learning_rate": 2.7449900997727496e-06, + "loss": 0.801696, + "num_input_tokens_seen": 71214675, + "router_z_loss_clip": 3.50585938, + "router_z_loss_mlp": 2.72070312, + "step": 3303, + "time_per_iteration": 2.924755096435547 + }, + { + "auxiliary_loss_clip": 0.01531117, + "auxiliary_loss_mlp": 0.01293527, + "balance_loss_clip": 1.1813072, + "balance_loss_mlp": 1.02611685, + "epoch": 0.39728251067155657, + "flos": 23479396208640.0, + "grad_norm": 2.2934429661167854, + "language_loss": 0.84195209, + "learning_rate": 2.744267130970476e-06, + "loss": 0.87019855, + "num_input_tokens_seen": 71234400, + "router_z_loss_clip": 3.49804688, + "router_z_loss_mlp": 2.67578125, + "step": 3304, + "time_per_iteration": 2.9345760345458984 + }, + { + "auxiliary_loss_clip": 0.01527879, + "auxiliary_loss_mlp": 0.01291413, + "balance_loss_clip": 1.17826509, + "balance_loss_mlp": 1.019997, + "epoch": 0.3974027535621956, + "flos": 20706554527200.0, + "grad_norm": 1.8276565051845741, + "language_loss": 0.77198613, + "learning_rate": 2.7435440492634697e-06, + "loss": 0.800179, + "num_input_tokens_seen": 71253725, + "router_z_loss_clip": 3.4921875, + "router_z_loss_mlp": 2.71679688, + "step": 3305, + "time_per_iteration": 2.933089256286621 + }, + { + "auxiliary_loss_clip": 0.01524183, + "auxiliary_loss_mlp": 0.01302396, + "balance_loss_clip": 1.17381048, + "balance_loss_mlp": 1.02697492, + "epoch": 0.39752299645283473, + "flos": 21545420021280.0, + "grad_norm": 2.4681755814483686, + "language_loss": 0.6729883, + "learning_rate": 2.7428208547614228e-06, + "loss": 0.70125401, + "num_input_tokens_seen": 71273220, + "router_z_loss_clip": 3.50585938, + "router_z_loss_mlp": 2.7578125, + "step": 3306, + "time_per_iteration": 3.076960325241089 + }, + { + "auxiliary_loss_clip": 0.01528654, + "auxiliary_loss_mlp": 0.01294387, + "balance_loss_clip": 1.17799151, + "balance_loss_mlp": 1.02545047, + "epoch": 0.39764323934347384, + "flos": 19210311606240.0, + "grad_norm": 2.10897100800255, + "language_loss": 0.76925933, + "learning_rate": 2.742097547574043e-06, + "loss": 0.79748976, + "num_input_tokens_seen": 71291445, + "router_z_loss_clip": 3.50585938, + "router_z_loss_mlp": 2.69140625, + "step": 3307, + "time_per_iteration": 2.9811787605285645 + }, + { + "auxiliary_loss_clip": 0.01528101, + "auxiliary_loss_mlp": 0.01315978, + "balance_loss_clip": 1.17839742, + "balance_loss_mlp": 1.04151046, + "epoch": 0.3977634822341129, + "flos": 20852313903360.0, + "grad_norm": 2.0581747051638404, + "language_loss": 0.77884603, + "learning_rate": 2.7413741278110544e-06, + "loss": 0.8072868, + "num_input_tokens_seen": 71310135, + "router_z_loss_clip": 3.49804688, + "router_z_loss_mlp": 2.74804688, + "step": 3308, + "time_per_iteration": 2.980928421020508 + }, + { + "auxiliary_loss_clip": 0.01530056, + "auxiliary_loss_mlp": 0.01316269, + "balance_loss_clip": 1.17996895, + "balance_loss_mlp": 1.04122889, + "epoch": 0.397883725124752, + "flos": 39789198853920.0, + "grad_norm": 3.9049465694694607, + "language_loss": 0.69231117, + "learning_rate": 2.7406505955822016e-06, + "loss": 0.72077441, + "num_input_tokens_seen": 71331160, + "router_z_loss_clip": 3.5, + "router_z_loss_mlp": 2.75390625, + "step": 3309, + "time_per_iteration": 3.9367051124572754 + }, + { + "auxiliary_loss_clip": 0.01530282, + "auxiliary_loss_mlp": 0.01321601, + "balance_loss_clip": 1.17973268, + "balance_loss_mlp": 1.04846883, + "epoch": 0.39800396801539106, + "flos": 17381169946080.0, + "grad_norm": 7.349897207722698, + "language_loss": 0.66414142, + "learning_rate": 2.7399269509972415e-06, + "loss": 0.69266033, + "num_input_tokens_seen": 71345315, + "router_z_loss_clip": 3.50976562, + "router_z_loss_mlp": 2.734375, + "step": 3310, + "time_per_iteration": 3.84965181350708 + }, + { + "auxiliary_loss_clip": 0.01521972, + "auxiliary_loss_mlp": 0.01307141, + "balance_loss_clip": 1.17184424, + "balance_loss_mlp": 1.03419924, + "epoch": 0.3981242109060302, + "flos": 19204470669600.0, + "grad_norm": 2.97320012896625, + "language_loss": 0.85258567, + "learning_rate": 2.7392031941659514e-06, + "loss": 0.88087678, + "num_input_tokens_seen": 71363160, + "router_z_loss_clip": 3.5, + "router_z_loss_mlp": 2.73242188, + "step": 3311, + "time_per_iteration": 2.955687999725342 + }, + { + "auxiliary_loss_clip": 0.01530273, + "auxiliary_loss_mlp": 0.01322508, + "balance_loss_clip": 1.17998672, + "balance_loss_mlp": 1.04899383, + "epoch": 0.3982444537966693, + "flos": 24567072982560.0, + "grad_norm": 1.8111727456200828, + "language_loss": 0.85753322, + "learning_rate": 2.7384793251981244e-06, + "loss": 0.88606101, + "num_input_tokens_seen": 71382145, + "router_z_loss_clip": 3.50195312, + "router_z_loss_mlp": 2.73828125, + "step": 3312, + "time_per_iteration": 3.0443742275238037 + }, + { + "auxiliary_loss_clip": 0.01523139, + "auxiliary_loss_mlp": 0.01301706, + "balance_loss_clip": 1.17293966, + "balance_loss_mlp": 1.02819252, + "epoch": 0.39836469668730834, + "flos": 26216281630080.0, + "grad_norm": 2.1482890378616974, + "language_loss": 0.81263155, + "learning_rate": 2.737755344203571e-06, + "loss": 0.84087998, + "num_input_tokens_seen": 71402095, + "router_z_loss_clip": 3.50195312, + "router_z_loss_mlp": 2.73828125, + "step": 3313, + "time_per_iteration": 3.8668806552886963 + }, + { + "auxiliary_loss_clip": 0.01527157, + "auxiliary_loss_mlp": 0.01311029, + "balance_loss_clip": 1.17741799, + "balance_loss_mlp": 1.04075778, + "epoch": 0.39848493957794745, + "flos": 27638601838560.0, + "grad_norm": 1.5645439132422665, + "language_loss": 0.80088603, + "learning_rate": 2.7370312512921186e-06, + "loss": 0.82926786, + "num_input_tokens_seen": 71423875, + "router_z_loss_clip": 3.5, + "router_z_loss_mlp": 2.70507812, + "step": 3314, + "time_per_iteration": 3.073753833770752 + }, + { + "auxiliary_loss_clip": 0.01521935, + "auxiliary_loss_mlp": 0.01308449, + "balance_loss_clip": 1.17054462, + "balance_loss_mlp": 1.03436291, + "epoch": 0.39860518246858656, + "flos": 12240942621120.0, + "grad_norm": 2.6123432958452524, + "language_loss": 0.76732612, + "learning_rate": 2.736307046573611e-06, + "loss": 0.79562998, + "num_input_tokens_seen": 71439745, + "router_z_loss_clip": 3.51171875, + "router_z_loss_mlp": 2.74414062, + "step": 3315, + "time_per_iteration": 3.1145200729370117 + }, + { + "auxiliary_loss_clip": 0.01522183, + "auxiliary_loss_mlp": 0.01306944, + "balance_loss_clip": 1.17110395, + "balance_loss_mlp": 1.03514719, + "epoch": 0.3987254253592256, + "flos": 22381137478080.0, + "grad_norm": 1.6310024565328438, + "language_loss": 0.82126182, + "learning_rate": 2.73558273015791e-06, + "loss": 0.84955311, + "num_input_tokens_seen": 71459575, + "router_z_loss_clip": 3.50585938, + "router_z_loss_mlp": 2.72070312, + "step": 3316, + "time_per_iteration": 3.931351900100708 + }, + { + "auxiliary_loss_clip": 0.01525616, + "auxiliary_loss_mlp": 0.01308495, + "balance_loss_clip": 1.17637026, + "balance_loss_mlp": 1.03631628, + "epoch": 0.3988456682498647, + "flos": 23516300672640.0, + "grad_norm": 3.4564166777754255, + "language_loss": 0.70193112, + "learning_rate": 2.734858302154894e-06, + "loss": 0.73027223, + "num_input_tokens_seen": 71481075, + "router_z_loss_clip": 3.4921875, + "router_z_loss_mlp": 2.72460938, + "step": 3317, + "time_per_iteration": 3.0108325481414795 + }, + { + "auxiliary_loss_clip": 0.01520975, + "auxiliary_loss_mlp": 0.01296941, + "balance_loss_clip": 1.17081618, + "balance_loss_mlp": 1.02628827, + "epoch": 0.39896591114050384, + "flos": 19210804672320.0, + "grad_norm": 2.5698098934393854, + "language_loss": 0.76428765, + "learning_rate": 2.734133762674457e-06, + "loss": 0.79246682, + "num_input_tokens_seen": 71500665, + "router_z_loss_clip": 3.50195312, + "router_z_loss_mlp": 2.70898438, + "step": 3318, + "time_per_iteration": 2.903085470199585 + }, + { + "auxiliary_loss_clip": 0.01521455, + "auxiliary_loss_mlp": 0.01296272, + "balance_loss_clip": 1.16923451, + "balance_loss_mlp": 1.02371216, + "epoch": 0.3990861540311429, + "flos": 28403468763840.0, + "grad_norm": 2.4570675830709576, + "language_loss": 0.71050203, + "learning_rate": 2.7334091118265124e-06, + "loss": 0.73867929, + "num_input_tokens_seen": 71522560, + "router_z_loss_clip": 3.52148438, + "router_z_loss_mlp": 2.72851562, + "step": 3319, + "time_per_iteration": 3.0527684688568115 + }, + { + "auxiliary_loss_clip": 0.01606964, + "auxiliary_loss_mlp": 0.01266769, + "balance_loss_clip": 1.26424885, + "balance_loss_mlp": 1.04704285, + "epoch": 0.399206396921782, + "flos": 61765448852160.0, + "grad_norm": 0.7006102087604598, + "language_loss": 0.57806873, + "learning_rate": 2.732684349720989e-06, + "loss": 0.60680604, + "num_input_tokens_seen": 71590520, + "router_z_loss_clip": 3.421875, + "router_z_loss_mlp": 2.203125, + "step": 3320, + "time_per_iteration": 3.437917709350586 + }, + { + "auxiliary_loss_clip": 0.01519756, + "auxiliary_loss_mlp": 0.01300819, + "balance_loss_clip": 1.16979861, + "balance_loss_mlp": 1.02844977, + "epoch": 0.3993266398124211, + "flos": 28077093668160.0, + "grad_norm": 1.9014677477260014, + "language_loss": 0.75590444, + "learning_rate": 2.7319594764678318e-06, + "loss": 0.78411025, + "num_input_tokens_seen": 71612620, + "router_z_loss_clip": 3.49804688, + "router_z_loss_mlp": 2.7265625, + "step": 3321, + "time_per_iteration": 2.9757416248321533 + }, + { + "auxiliary_loss_clip": 0.01518617, + "auxiliary_loss_mlp": 0.01289339, + "balance_loss_clip": 1.16718853, + "balance_loss_mlp": 1.0183053, + "epoch": 0.39944688270306017, + "flos": 23227512747840.0, + "grad_norm": 2.414487054448485, + "language_loss": 0.83572459, + "learning_rate": 2.7312344921770044e-06, + "loss": 0.86380416, + "num_input_tokens_seen": 71634320, + "router_z_loss_clip": 3.50976562, + "router_z_loss_mlp": 2.71289062, + "step": 3322, + "time_per_iteration": 2.9548842906951904 + }, + { + "auxiliary_loss_clip": 0.01522873, + "auxiliary_loss_mlp": 0.0129349, + "balance_loss_clip": 1.17165446, + "balance_loss_mlp": 1.02321887, + "epoch": 0.3995671255936993, + "flos": 19392444452160.0, + "grad_norm": 2.2843448261633115, + "language_loss": 0.78427279, + "learning_rate": 2.7305093969584857e-06, + "loss": 0.8124364, + "num_input_tokens_seen": 71653145, + "router_z_loss_clip": 3.50976562, + "router_z_loss_mlp": 2.70507812, + "step": 3323, + "time_per_iteration": 2.9041318893432617 + }, + { + "auxiliary_loss_clip": 0.01530294, + "auxiliary_loss_mlp": 0.013, + "balance_loss_clip": 1.17898655, + "balance_loss_mlp": 1.02953756, + "epoch": 0.3996873684843384, + "flos": 23844951457920.0, + "grad_norm": 3.177573304915415, + "language_loss": 0.80108202, + "learning_rate": 2.729784190922272e-06, + "loss": 0.82938492, + "num_input_tokens_seen": 71674580, + "router_z_loss_clip": 3.50976562, + "router_z_loss_mlp": 2.70703125, + "step": 3324, + "time_per_iteration": 2.989313840866089 + }, + { + "auxiliary_loss_clip": 0.016056, + "auxiliary_loss_mlp": 0.01200371, + "balance_loss_clip": 1.26315355, + "balance_loss_mlp": 0.98140717, + "epoch": 0.39980761137497745, + "flos": 66584155885920.0, + "grad_norm": 0.9558450950467533, + "language_loss": 0.57123345, + "learning_rate": 2.729058874178378e-06, + "loss": 0.59929311, + "num_input_tokens_seen": 71745260, + "router_z_loss_clip": 3.421875, + "router_z_loss_mlp": 2.1953125, + "step": 3325, + "time_per_iteration": 3.5119171142578125 + }, + { + "auxiliary_loss_clip": 0.01534165, + "auxiliary_loss_mlp": 0.0130171, + "balance_loss_clip": 1.18192756, + "balance_loss_mlp": 1.02953112, + "epoch": 0.39992785426561656, + "flos": 28551731398560.0, + "grad_norm": 2.3919260395290203, + "language_loss": 0.69511676, + "learning_rate": 2.7283334468368315e-06, + "loss": 0.72347558, + "num_input_tokens_seen": 71766540, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 2.72460938, + "step": 3326, + "time_per_iteration": 3.0084805488586426 + }, + { + "auxiliary_loss_clip": 0.01521946, + "auxiliary_loss_mlp": 0.01291231, + "balance_loss_clip": 1.17019093, + "balance_loss_mlp": 1.02019691, + "epoch": 0.4000480971562556, + "flos": 15051219760800.0, + "grad_norm": 2.0212364834835217, + "language_loss": 0.73336262, + "learning_rate": 2.72760790900768e-06, + "loss": 0.7614944, + "num_input_tokens_seen": 71783125, + "router_z_loss_clip": 3.515625, + "router_z_loss_mlp": 2.71289062, + "step": 3327, + "time_per_iteration": 3.0676159858703613 + }, + { + "auxiliary_loss_clip": 0.0152842, + "auxiliary_loss_mlp": 0.0129431, + "balance_loss_clip": 1.17740273, + "balance_loss_mlp": 1.0251832, + "epoch": 0.4001683400468947, + "flos": 23917925966400.0, + "grad_norm": 1.7273413505996755, + "language_loss": 0.78921908, + "learning_rate": 2.7268822608009875e-06, + "loss": 0.81744641, + "num_input_tokens_seen": 71802500, + "router_z_loss_clip": 3.51171875, + "router_z_loss_mlp": 2.69335938, + "step": 3328, + "time_per_iteration": 3.105414628982544 + }, + { + "auxiliary_loss_clip": 0.01523873, + "auxiliary_loss_mlp": 0.01306012, + "balance_loss_clip": 1.17116392, + "balance_loss_mlp": 1.03497791, + "epoch": 0.40028858293753383, + "flos": 24354976525920.0, + "grad_norm": 2.245476712194061, + "language_loss": 0.7819258, + "learning_rate": 2.726156502326834e-06, + "loss": 0.81022465, + "num_input_tokens_seen": 71823800, + "router_z_loss_clip": 3.52539062, + "router_z_loss_mlp": 2.71289062, + "step": 3329, + "time_per_iteration": 3.1066818237304688 + }, + { + "auxiliary_loss_clip": 0.01617377, + "auxiliary_loss_mlp": 0.01203422, + "balance_loss_clip": 1.27599001, + "balance_loss_mlp": 0.98445892, + "epoch": 0.4004088258281729, + "flos": 66793824940320.0, + "grad_norm": 0.881953515758608, + "language_loss": 0.60235959, + "learning_rate": 2.725430633695316e-06, + "loss": 0.63056761, + "num_input_tokens_seen": 71886880, + "router_z_loss_clip": 3.40625, + "router_z_loss_mlp": 2.1953125, + "step": 3330, + "time_per_iteration": 3.587310552597046 + }, + { + "auxiliary_loss_clip": 0.01618112, + "auxiliary_loss_mlp": 0.01203552, + "balance_loss_clip": 1.27723372, + "balance_loss_mlp": 0.98458862, + "epoch": 0.400529068718812, + "flos": 58603839887520.0, + "grad_norm": 0.8993729093753814, + "language_loss": 0.57914174, + "learning_rate": 2.7247046550165485e-06, + "loss": 0.60735846, + "num_input_tokens_seen": 71939005, + "router_z_loss_clip": 3.40625, + "router_z_loss_mlp": 2.1953125, + "step": 3331, + "time_per_iteration": 3.2649519443511963 + }, + { + "auxiliary_loss_clip": 0.01533466, + "auxiliary_loss_mlp": 0.01314722, + "balance_loss_clip": 1.18166423, + "balance_loss_mlp": 1.04006362, + "epoch": 0.4006493116094511, + "flos": 25379995250880.0, + "grad_norm": 5.66981169298126, + "language_loss": 0.76060414, + "learning_rate": 2.7239785664006606e-06, + "loss": 0.7890861, + "num_input_tokens_seen": 71962545, + "router_z_loss_clip": 3.515625, + "router_z_loss_mlp": 2.75, + "step": 3332, + "time_per_iteration": 3.108769655227661 + }, + { + "auxiliary_loss_clip": 0.01617159, + "auxiliary_loss_mlp": 0.0123436, + "balance_loss_clip": 1.27601194, + "balance_loss_mlp": 1.0131073, + "epoch": 0.40076955450009016, + "flos": 60286501536480.0, + "grad_norm": 0.7747110598674197, + "language_loss": 0.61714804, + "learning_rate": 2.7232523679578002e-06, + "loss": 0.64566326, + "num_input_tokens_seen": 72025625, + "router_z_loss_clip": 3.40625, + "router_z_loss_mlp": 2.21875, + "step": 3333, + "time_per_iteration": 3.5382275581359863 + }, + { + "auxiliary_loss_clip": 0.01528457, + "auxiliary_loss_mlp": 0.01294966, + "balance_loss_clip": 1.17635345, + "balance_loss_mlp": 1.02469444, + "epoch": 0.4008897973907293, + "flos": 16619109704640.0, + "grad_norm": 2.937165528501343, + "language_loss": 0.79344183, + "learning_rate": 2.7225260597981295e-06, + "loss": 0.82167602, + "num_input_tokens_seen": 72043330, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 2.70507812, + "step": 3334, + "time_per_iteration": 3.0600950717926025 + }, + { + "auxiliary_loss_clip": 0.01522868, + "auxiliary_loss_mlp": 0.01308541, + "balance_loss_clip": 1.17097962, + "balance_loss_mlp": 1.035218, + "epoch": 0.4010100402813684, + "flos": 15379149911040.0, + "grad_norm": 2.776597273364016, + "language_loss": 0.78675383, + "learning_rate": 2.721799642031831e-06, + "loss": 0.81506789, + "num_input_tokens_seen": 72059500, + "router_z_loss_clip": 3.51757812, + "router_z_loss_mlp": 2.73632812, + "step": 3335, + "time_per_iteration": 3.0658657550811768 + }, + { + "auxiliary_loss_clip": 0.01531026, + "auxiliary_loss_mlp": 0.01306163, + "balance_loss_clip": 1.17862034, + "balance_loss_mlp": 1.03379381, + "epoch": 0.40113028317200744, + "flos": 13300590120480.0, + "grad_norm": 4.507083004127739, + "language_loss": 0.77872622, + "learning_rate": 2.721073114769101e-06, + "loss": 0.80709809, + "num_input_tokens_seen": 72077175, + "router_z_loss_clip": 3.52148438, + "router_z_loss_mlp": 2.7265625, + "step": 3336, + "time_per_iteration": 3.0218491554260254 + }, + { + "auxiliary_loss_clip": 0.01523481, + "auxiliary_loss_mlp": 0.01296348, + "balance_loss_clip": 1.17039227, + "balance_loss_mlp": 1.02683949, + "epoch": 0.40125052606264655, + "flos": 20670143129280.0, + "grad_norm": 1.9606127373847402, + "language_loss": 0.74916917, + "learning_rate": 2.7203464781201523e-06, + "loss": 0.77736747, + "num_input_tokens_seen": 72096490, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 2.69726562, + "step": 3337, + "time_per_iteration": 3.8312509059906006 + }, + { + "auxiliary_loss_clip": 0.01522155, + "auxiliary_loss_mlp": 0.01305437, + "balance_loss_clip": 1.16921949, + "balance_loss_mlp": 1.03459322, + "epoch": 0.40137076895328566, + "flos": 24609970095840.0, + "grad_norm": 2.5915120752528567, + "language_loss": 0.78126478, + "learning_rate": 2.719619732195215e-06, + "loss": 0.80954075, + "num_input_tokens_seen": 72118130, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 2.7109375, + "step": 3338, + "time_per_iteration": 3.9178242683410645 + }, + { + "auxiliary_loss_clip": 0.01519867, + "auxiliary_loss_mlp": 0.0129935, + "balance_loss_clip": 1.16743326, + "balance_loss_mlp": 1.02850652, + "epoch": 0.4014910118439247, + "flos": 24208875796320.0, + "grad_norm": 1.5641083681733874, + "language_loss": 0.73119879, + "learning_rate": 2.7188928771045377e-06, + "loss": 0.75939095, + "num_input_tokens_seen": 72139450, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 2.7109375, + "step": 3339, + "time_per_iteration": 2.995426654815674 + }, + { + "auxiliary_loss_clip": 0.01516173, + "auxiliary_loss_mlp": 0.0129712, + "balance_loss_clip": 1.1645745, + "balance_loss_mlp": 1.02646756, + "epoch": 0.4016112547345638, + "flos": 26727558327360.0, + "grad_norm": 2.130571344188892, + "language_loss": 0.80089235, + "learning_rate": 2.7181659129583815e-06, + "loss": 0.82902527, + "num_input_tokens_seen": 72159040, + "router_z_loss_clip": 3.515625, + "router_z_loss_mlp": 2.70898438, + "step": 3340, + "time_per_iteration": 3.861380100250244 + }, + { + "auxiliary_loss_clip": 0.01519969, + "auxiliary_loss_mlp": 0.01303746, + "balance_loss_clip": 1.16650259, + "balance_loss_mlp": 1.03175783, + "epoch": 0.4017314976252029, + "flos": 21290160954240.0, + "grad_norm": 2.2009334643757623, + "language_loss": 0.76354474, + "learning_rate": 2.7174388398670276e-06, + "loss": 0.7917819, + "num_input_tokens_seen": 72178220, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 2.72265625, + "step": 3341, + "time_per_iteration": 2.9653356075286865 + }, + { + "auxiliary_loss_clip": 0.01510631, + "auxiliary_loss_mlp": 0.01297949, + "balance_loss_clip": 1.15701866, + "balance_loss_mlp": 1.02634239, + "epoch": 0.401851740515842, + "flos": 25487105467680.0, + "grad_norm": 2.002210881462642, + "language_loss": 0.9231571, + "learning_rate": 2.716711657940773e-06, + "loss": 0.95124292, + "num_input_tokens_seen": 72199230, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 2.71875, + "step": 3342, + "time_per_iteration": 4.181603193283081 + }, + { + "auxiliary_loss_clip": 0.01590629, + "auxiliary_loss_mlp": 0.01276413, + "balance_loss_clip": 1.24758935, + "balance_loss_mlp": 1.05439758, + "epoch": 0.4019719834064811, + "flos": 55401040212480.0, + "grad_norm": 0.835870520797883, + "language_loss": 0.56419921, + "learning_rate": 2.7159843672899284e-06, + "loss": 0.59286964, + "num_input_tokens_seen": 72263430, + "router_z_loss_clip": 3.4375, + "router_z_loss_mlp": 2.2265625, + "step": 3343, + "time_per_iteration": 3.5976674556732178 + }, + { + "auxiliary_loss_clip": 0.0152172, + "auxiliary_loss_mlp": 0.01299761, + "balance_loss_clip": 1.16772699, + "balance_loss_mlp": 1.02758265, + "epoch": 0.40209222629712016, + "flos": 18181689706080.0, + "grad_norm": 1.9231151997699383, + "language_loss": 0.81759745, + "learning_rate": 2.715256968024825e-06, + "loss": 0.84581232, + "num_input_tokens_seen": 72280505, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 2.72460938, + "step": 3344, + "time_per_iteration": 3.0803022384643555 + }, + { + "auxiliary_loss_clip": 0.01516331, + "auxiliary_loss_mlp": 0.01291983, + "balance_loss_clip": 1.16440558, + "balance_loss_mlp": 1.02304673, + "epoch": 0.40221246918775927, + "flos": 25963563749760.0, + "grad_norm": 1.5936904998614878, + "language_loss": 0.82512903, + "learning_rate": 2.7145294602558083e-06, + "loss": 0.85321218, + "num_input_tokens_seen": 72301215, + "router_z_loss_clip": 3.515625, + "router_z_loss_mlp": 2.69140625, + "step": 3345, + "time_per_iteration": 3.0610766410827637 + }, + { + "auxiliary_loss_clip": 0.01516537, + "auxiliary_loss_mlp": 0.01297658, + "balance_loss_clip": 1.16312504, + "balance_loss_mlp": 1.02338147, + "epoch": 0.4023327120783984, + "flos": 33841738484640.0, + "grad_norm": 1.9076698549498419, + "language_loss": 0.70672715, + "learning_rate": 2.713801844093241e-06, + "loss": 0.73486912, + "num_input_tokens_seen": 72322365, + "router_z_loss_clip": 3.52734375, + "router_z_loss_mlp": 2.74609375, + "step": 3346, + "time_per_iteration": 3.00136399269104 + }, + { + "auxiliary_loss_clip": 0.01517511, + "auxiliary_loss_mlp": 0.01292252, + "balance_loss_clip": 1.16388822, + "balance_loss_mlp": 1.02140892, + "epoch": 0.40245295496903744, + "flos": 26902333110240.0, + "grad_norm": 2.757819728410653, + "language_loss": 0.88100982, + "learning_rate": 2.7130741196475014e-06, + "loss": 0.90910751, + "num_input_tokens_seen": 72340495, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 2.7109375, + "step": 3347, + "time_per_iteration": 3.0043399333953857 + }, + { + "auxiliary_loss_clip": 0.01511866, + "auxiliary_loss_mlp": 0.01311879, + "balance_loss_clip": 1.15769458, + "balance_loss_mlp": 1.03645825, + "epoch": 0.40257319785967655, + "flos": 36905681708640.0, + "grad_norm": 2.390113869369307, + "language_loss": 0.79332995, + "learning_rate": 2.7123462870289848e-06, + "loss": 0.82156742, + "num_input_tokens_seen": 72360545, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 2.7578125, + "step": 3348, + "time_per_iteration": 3.0424857139587402 + }, + { + "auxiliary_loss_clip": 0.0151273, + "auxiliary_loss_mlp": 0.01295687, + "balance_loss_clip": 1.15911782, + "balance_loss_mlp": 1.02579689, + "epoch": 0.40269344075031566, + "flos": 24355507520160.0, + "grad_norm": 3.1045186951131676, + "language_loss": 0.81034136, + "learning_rate": 2.711618346348102e-06, + "loss": 0.83842552, + "num_input_tokens_seen": 72381070, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 2.70117188, + "step": 3349, + "time_per_iteration": 3.1322669982910156 + }, + { + "auxiliary_loss_clip": 0.01518567, + "auxiliary_loss_mlp": 0.01290883, + "balance_loss_clip": 1.16531503, + "balance_loss_mlp": 1.02480769, + "epoch": 0.4028136836409547, + "flos": 14391263219040.0, + "grad_norm": 3.617823332073497, + "language_loss": 0.63505024, + "learning_rate": 2.7108902977152825e-06, + "loss": 0.66314471, + "num_input_tokens_seen": 72398970, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 2.66210938, + "step": 3350, + "time_per_iteration": 3.1613264083862305 + }, + { + "auxiliary_loss_clip": 0.01514372, + "auxiliary_loss_mlp": 0.01300188, + "balance_loss_clip": 1.15948701, + "balance_loss_mlp": 1.03067994, + "epoch": 0.4029339265315938, + "flos": 26138186820000.0, + "grad_norm": 4.158760449656193, + "language_loss": 0.74925929, + "learning_rate": 2.7101621412409704e-06, + "loss": 0.77740484, + "num_input_tokens_seen": 72418455, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 2.69726562, + "step": 3351, + "time_per_iteration": 3.0727784633636475 + }, + { + "auxiliary_loss_clip": 0.01515612, + "auxiliary_loss_mlp": 0.01300496, + "balance_loss_clip": 1.16088915, + "balance_loss_mlp": 1.02831769, + "epoch": 0.40305416942223293, + "flos": 23258462490720.0, + "grad_norm": 2.11609909030065, + "language_loss": 0.85884845, + "learning_rate": 2.7094338770356256e-06, + "loss": 0.88700956, + "num_input_tokens_seen": 72437540, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.72460938, + "step": 3352, + "time_per_iteration": 3.0466489791870117 + }, + { + "auxiliary_loss_clip": 0.01512456, + "auxiliary_loss_mlp": 0.01302645, + "balance_loss_clip": 1.15853548, + "balance_loss_mlp": 1.03180158, + "epoch": 0.403174412312872, + "flos": 27092127444480.0, + "grad_norm": 2.0340171684441755, + "language_loss": 0.64261234, + "learning_rate": 2.708705505209726e-06, + "loss": 0.67076337, + "num_input_tokens_seen": 72458315, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 2.7109375, + "step": 3353, + "time_per_iteration": 2.9887874126434326 + }, + { + "auxiliary_loss_clip": 0.01521944, + "auxiliary_loss_mlp": 0.0129515, + "balance_loss_clip": 1.16747904, + "balance_loss_mlp": 1.02487874, + "epoch": 0.4032946552035111, + "flos": 21758161256640.0, + "grad_norm": 1.9822680371540111, + "language_loss": 0.91756225, + "learning_rate": 2.7079770258737646e-06, + "loss": 0.94573319, + "num_input_tokens_seen": 72476225, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.70507812, + "step": 3354, + "time_per_iteration": 3.081968069076538 + }, + { + "auxiliary_loss_clip": 0.01515161, + "auxiliary_loss_mlp": 0.0130444, + "balance_loss_clip": 1.16081119, + "balance_loss_mlp": 1.03092635, + "epoch": 0.4034148980941502, + "flos": 17345668824000.0, + "grad_norm": 3.2561250490418243, + "language_loss": 0.75133699, + "learning_rate": 2.707248439138251e-06, + "loss": 0.77953297, + "num_input_tokens_seen": 72492460, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 2.73828125, + "step": 3355, + "time_per_iteration": 3.0343751907348633 + }, + { + "auxiliary_loss_clip": 0.01516795, + "auxiliary_loss_mlp": 0.01297197, + "balance_loss_clip": 1.16402197, + "balance_loss_mlp": 1.02787971, + "epoch": 0.40353514098478926, + "flos": 22020019823520.0, + "grad_norm": 2.234950284242555, + "language_loss": 0.65486157, + "learning_rate": 2.7065197451137114e-06, + "loss": 0.68300152, + "num_input_tokens_seen": 72513840, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 2.6953125, + "step": 3356, + "time_per_iteration": 2.9945523738861084 + }, + { + "auxiliary_loss_clip": 0.01512357, + "auxiliary_loss_mlp": 0.0129401, + "balance_loss_clip": 1.15787077, + "balance_loss_mlp": 1.02297592, + "epoch": 0.4036553838754284, + "flos": 14248196742240.0, + "grad_norm": 2.3918097757941412, + "language_loss": 0.67163354, + "learning_rate": 2.7057909439106894e-06, + "loss": 0.6996972, + "num_input_tokens_seen": 72531695, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.71289062, + "step": 3357, + "time_per_iteration": 3.0290491580963135 + }, + { + "auxiliary_loss_clip": 0.01515579, + "auxiliary_loss_mlp": 0.01304084, + "balance_loss_clip": 1.16265035, + "balance_loss_mlp": 1.03133285, + "epoch": 0.40377562676606743, + "flos": 24792975289440.0, + "grad_norm": 2.2337768187310374, + "language_loss": 0.78376377, + "learning_rate": 2.7050620356397417e-06, + "loss": 0.81196034, + "num_input_tokens_seen": 72550645, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 2.73046875, + "step": 3358, + "time_per_iteration": 2.9566924571990967 + }, + { + "auxiliary_loss_clip": 0.01513893, + "auxiliary_loss_mlp": 0.01285714, + "balance_loss_clip": 1.16003954, + "balance_loss_mlp": 1.02021074, + "epoch": 0.40389586965670654, + "flos": 24063533629920.0, + "grad_norm": 2.3231427850973287, + "language_loss": 0.72512102, + "learning_rate": 2.7043330204114437e-06, + "loss": 0.75311708, + "num_input_tokens_seen": 72569355, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 2.65625, + "step": 3359, + "time_per_iteration": 2.96120023727417 + }, + { + "auxiliary_loss_clip": 0.01513169, + "auxiliary_loss_mlp": 0.01297379, + "balance_loss_clip": 1.16066468, + "balance_loss_mlp": 1.02787054, + "epoch": 0.40401611254734565, + "flos": 16401513664800.0, + "grad_norm": 2.0988498298885903, + "language_loss": 0.85795367, + "learning_rate": 2.7036038983363862e-06, + "loss": 0.88605917, + "num_input_tokens_seen": 72585960, + "router_z_loss_clip": 3.5234375, + "router_z_loss_mlp": 2.69726562, + "step": 3360, + "time_per_iteration": 3.053666353225708 + }, + { + "auxiliary_loss_clip": 0.0151502, + "auxiliary_loss_mlp": 0.0129074, + "balance_loss_clip": 1.16177082, + "balance_loss_mlp": 1.02352071, + "epoch": 0.4041363554379847, + "flos": 23990521193280.0, + "grad_norm": 1.8617373048455215, + "language_loss": 0.84286153, + "learning_rate": 2.702874669525177e-06, + "loss": 0.87091917, + "num_input_tokens_seen": 72604440, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 2.67382812, + "step": 3361, + "time_per_iteration": 3.2423839569091797 + }, + { + "auxiliary_loss_clip": 0.01520578, + "auxiliary_loss_mlp": 0.01292255, + "balance_loss_clip": 1.16732275, + "balance_loss_mlp": 1.0242722, + "epoch": 0.4042565983286238, + "flos": 28404454896000.0, + "grad_norm": 2.0537455536610807, + "language_loss": 0.69608516, + "learning_rate": 2.7021453340884394e-06, + "loss": 0.72421348, + "num_input_tokens_seen": 72622165, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 2.68164062, + "step": 3362, + "time_per_iteration": 3.1948020458221436 + }, + { + "auxiliary_loss_clip": 0.01524611, + "auxiliary_loss_mlp": 0.01305059, + "balance_loss_clip": 1.17190862, + "balance_loss_mlp": 1.0344063, + "epoch": 0.40437684121926293, + "flos": 17712817056000.0, + "grad_norm": 2.5506169245037307, + "language_loss": 0.72723526, + "learning_rate": 2.7014158921368125e-06, + "loss": 0.75553203, + "num_input_tokens_seen": 72640490, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 2.70898438, + "step": 3363, + "time_per_iteration": 3.133255958557129 + }, + { + "auxiliary_loss_clip": 0.01521797, + "auxiliary_loss_mlp": 0.01296122, + "balance_loss_clip": 1.16910696, + "balance_loss_mlp": 1.02566028, + "epoch": 0.404497084109902, + "flos": 24020939941920.0, + "grad_norm": 3.388511599757599, + "language_loss": 0.85538775, + "learning_rate": 2.700686343780953e-06, + "loss": 0.88356698, + "num_input_tokens_seen": 72660360, + "router_z_loss_clip": 3.52734375, + "router_z_loss_mlp": 2.70703125, + "step": 3364, + "time_per_iteration": 3.7891626358032227 + }, + { + "auxiliary_loss_clip": 0.01514794, + "auxiliary_loss_mlp": 0.01306645, + "balance_loss_clip": 1.16135192, + "balance_loss_mlp": 1.03599238, + "epoch": 0.4046173270005411, + "flos": 22932087395040.0, + "grad_norm": 1.8904570559211835, + "language_loss": 0.88097572, + "learning_rate": 2.699956689131532e-06, + "loss": 0.90919006, + "num_input_tokens_seen": 72680345, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 2.70898438, + "step": 3365, + "time_per_iteration": 3.8493525981903076 + }, + { + "auxiliary_loss_clip": 0.01516273, + "auxiliary_loss_mlp": 0.01297593, + "balance_loss_clip": 1.16349137, + "balance_loss_mlp": 1.02560484, + "epoch": 0.4047375698911802, + "flos": 20670256913760.0, + "grad_norm": 2.232249732254977, + "language_loss": 0.84842896, + "learning_rate": 2.699226928299238e-06, + "loss": 0.87656766, + "num_input_tokens_seen": 72698365, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 2.72265625, + "step": 3366, + "time_per_iteration": 3.0387518405914307 + }, + { + "auxiliary_loss_clip": 0.01518102, + "auxiliary_loss_mlp": 0.01298121, + "balance_loss_clip": 1.16547537, + "balance_loss_mlp": 1.02880287, + "epoch": 0.40485781278181926, + "flos": 28915390239840.0, + "grad_norm": 2.594021618202823, + "language_loss": 0.79173303, + "learning_rate": 2.698497061394774e-06, + "loss": 0.81989527, + "num_input_tokens_seen": 72716850, + "router_z_loss_clip": 3.52734375, + "router_z_loss_mlp": 2.6953125, + "step": 3367, + "time_per_iteration": 2.979668140411377 + }, + { + "auxiliary_loss_clip": 0.01520516, + "auxiliary_loss_mlp": 0.0129122, + "balance_loss_clip": 1.16877258, + "balance_loss_mlp": 1.02094913, + "epoch": 0.40497805567245837, + "flos": 23150593710720.0, + "grad_norm": 2.7264712751846387, + "language_loss": 0.80817628, + "learning_rate": 2.6977670885288627e-06, + "loss": 0.8362937, + "num_input_tokens_seen": 72738250, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 2.70507812, + "step": 3368, + "time_per_iteration": 3.8384997844696045 + }, + { + "auxiliary_loss_clip": 0.01519799, + "auxiliary_loss_mlp": 0.0128308, + "balance_loss_clip": 1.16658664, + "balance_loss_mlp": 1.01547897, + "epoch": 0.4050982985630975, + "flos": 16291255410720.0, + "grad_norm": 1.9201005368761657, + "language_loss": 0.75572878, + "learning_rate": 2.6970370098122378e-06, + "loss": 0.78375757, + "num_input_tokens_seen": 72755235, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 2.67773438, + "step": 3369, + "time_per_iteration": 3.0735459327697754 + }, + { + "auxiliary_loss_clip": 0.01524444, + "auxiliary_loss_mlp": 0.01292051, + "balance_loss_clip": 1.1712966, + "balance_loss_mlp": 1.02349663, + "epoch": 0.40521854145373654, + "flos": 34462363160160.0, + "grad_norm": 2.0991933154792752, + "language_loss": 0.86312437, + "learning_rate": 2.6963068253556535e-06, + "loss": 0.89128935, + "num_input_tokens_seen": 72776620, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 2.6875, + "step": 3370, + "time_per_iteration": 3.9549412727355957 + }, + { + "auxiliary_loss_clip": 0.01523699, + "auxiliary_loss_mlp": 0.01295121, + "balance_loss_clip": 1.17086089, + "balance_loss_mlp": 1.02504003, + "epoch": 0.40533878434437565, + "flos": 25333077752640.0, + "grad_norm": 2.012127532976758, + "language_loss": 0.85539001, + "learning_rate": 2.6955765352698763e-06, + "loss": 0.88357824, + "num_input_tokens_seen": 72796765, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 2.703125, + "step": 3371, + "time_per_iteration": 3.0454373359680176 + }, + { + "auxiliary_loss_clip": 0.01526415, + "auxiliary_loss_mlp": 0.01297803, + "balance_loss_clip": 1.17337191, + "balance_loss_mlp": 1.0254333, + "epoch": 0.40545902723501476, + "flos": 15013822230720.0, + "grad_norm": 2.3848994229371585, + "language_loss": 0.72812748, + "learning_rate": 2.6948461396656923e-06, + "loss": 0.75636971, + "num_input_tokens_seen": 72814175, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 2.7265625, + "step": 3372, + "time_per_iteration": 3.0280230045318604 + }, + { + "auxiliary_loss_clip": 0.01524357, + "auxiliary_loss_mlp": 0.01302217, + "balance_loss_clip": 1.1704905, + "balance_loss_mlp": 1.0290848, + "epoch": 0.4055792701256538, + "flos": 25523516865600.0, + "grad_norm": 9.734258044298539, + "language_loss": 0.74429858, + "learning_rate": 2.6941156386539013e-06, + "loss": 0.77256435, + "num_input_tokens_seen": 72834125, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 2.734375, + "step": 3373, + "time_per_iteration": 3.060826063156128 + }, + { + "auxiliary_loss_clip": 0.01531673, + "auxiliary_loss_mlp": 0.01312037, + "balance_loss_clip": 1.17774534, + "balance_loss_mlp": 1.03966808, + "epoch": 0.4056995130162929, + "flos": 19576701275040.0, + "grad_norm": 4.337492449398731, + "language_loss": 0.80927128, + "learning_rate": 2.6933850323453203e-06, + "loss": 0.83770835, + "num_input_tokens_seen": 72852570, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 2.7265625, + "step": 3374, + "time_per_iteration": 3.0767834186553955 + }, + { + "auxiliary_loss_clip": 0.01533927, + "auxiliary_loss_mlp": 0.01289148, + "balance_loss_clip": 1.18160129, + "balance_loss_mlp": 1.02288175, + "epoch": 0.405819755906932, + "flos": 15415599237120.0, + "grad_norm": 2.2955651656387466, + "language_loss": 0.74976152, + "learning_rate": 2.6926543208507806e-06, + "loss": 0.77799225, + "num_input_tokens_seen": 72871250, + "router_z_loss_clip": 3.52734375, + "router_z_loss_mlp": 2.6640625, + "step": 3375, + "time_per_iteration": 3.0887296199798584 + }, + { + "auxiliary_loss_clip": 0.01529074, + "auxiliary_loss_mlp": 0.01296433, + "balance_loss_clip": 1.17524981, + "balance_loss_mlp": 1.0267334, + "epoch": 0.4059399987975711, + "flos": 21436072043040.0, + "grad_norm": 2.263248278393505, + "language_loss": 0.80244386, + "learning_rate": 2.6919235042811316e-06, + "loss": 0.83069885, + "num_input_tokens_seen": 72890035, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 2.69921875, + "step": 3376, + "time_per_iteration": 2.957846164703369 + }, + { + "auxiliary_loss_clip": 0.0152827, + "auxiliary_loss_mlp": 0.01293596, + "balance_loss_clip": 1.17511415, + "balance_loss_mlp": 1.02179909, + "epoch": 0.4060602416882102, + "flos": 25559321412960.0, + "grad_norm": 2.5854849414343906, + "language_loss": 0.76621938, + "learning_rate": 2.691192582747237e-06, + "loss": 0.794438, + "num_input_tokens_seen": 72909665, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 2.72070312, + "step": 3377, + "time_per_iteration": 2.9516592025756836 + }, + { + "auxiliary_loss_clip": 0.01534059, + "auxiliary_loss_mlp": 0.01299039, + "balance_loss_clip": 1.18126833, + "balance_loss_mlp": 1.02819526, + "epoch": 0.40618048457884925, + "flos": 23768904768480.0, + "grad_norm": 2.4161792802438455, + "language_loss": 0.74234676, + "learning_rate": 2.6904615563599765e-06, + "loss": 0.77067769, + "num_input_tokens_seen": 72929465, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 2.7109375, + "step": 3378, + "time_per_iteration": 2.9055330753326416 + }, + { + "auxiliary_loss_clip": 0.015357, + "auxiliary_loss_mlp": 0.01301653, + "balance_loss_clip": 1.18190467, + "balance_loss_mlp": 1.0315721, + "epoch": 0.40630072746948837, + "flos": 17641625171040.0, + "grad_norm": 1.9088482680372558, + "language_loss": 0.83273041, + "learning_rate": 2.6897304252302477e-06, + "loss": 0.86110389, + "num_input_tokens_seen": 72946785, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 2.703125, + "step": 3379, + "time_per_iteration": 2.9509384632110596 + }, + { + "auxiliary_loss_clip": 0.0158763, + "auxiliary_loss_mlp": 0.01232666, + "balance_loss_clip": 1.24458337, + "balance_loss_mlp": 1.01904297, + "epoch": 0.4064209703601275, + "flos": 60842306257920.0, + "grad_norm": 0.9045569064590538, + "language_loss": 0.54717278, + "learning_rate": 2.688999189468962e-06, + "loss": 0.57537574, + "num_input_tokens_seen": 73003215, + "router_z_loss_clip": 3.4375, + "router_z_loss_mlp": 2.140625, + "step": 3380, + "time_per_iteration": 3.2962775230407715 + }, + { + "auxiliary_loss_clip": 0.01536319, + "auxiliary_loss_mlp": 0.01297202, + "balance_loss_clip": 1.18373525, + "balance_loss_mlp": 1.02654958, + "epoch": 0.40654121325076653, + "flos": 24026629165920.0, + "grad_norm": 2.774437268311373, + "language_loss": 0.76013625, + "learning_rate": 2.6882678491870464e-06, + "loss": 0.7884714, + "num_input_tokens_seen": 73023650, + "router_z_loss_clip": 3.52539062, + "router_z_loss_mlp": 2.70898438, + "step": 3381, + "time_per_iteration": 2.9351277351379395 + }, + { + "auxiliary_loss_clip": 0.01529456, + "auxiliary_loss_mlp": 0.01311715, + "balance_loss_clip": 1.17455173, + "balance_loss_mlp": 1.03915524, + "epoch": 0.40666145614140564, + "flos": 27346703804640.0, + "grad_norm": 1.9801477272351293, + "language_loss": 0.71119249, + "learning_rate": 2.6875364044954453e-06, + "loss": 0.73960418, + "num_input_tokens_seen": 73043880, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.72851562, + "step": 3382, + "time_per_iteration": 2.9996230602264404 + }, + { + "auxiliary_loss_clip": 0.0152863, + "auxiliary_loss_mlp": 0.01308052, + "balance_loss_clip": 1.17501259, + "balance_loss_mlp": 1.03873467, + "epoch": 0.40678169903204475, + "flos": 26179111668960.0, + "grad_norm": 1.7245654724866515, + "language_loss": 0.82407629, + "learning_rate": 2.6868048555051185e-06, + "loss": 0.8524431, + "num_input_tokens_seen": 73065410, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 2.6953125, + "step": 3383, + "time_per_iteration": 3.0302813053131104 + }, + { + "auxiliary_loss_clip": 0.01529345, + "auxiliary_loss_mlp": 0.01304924, + "balance_loss_clip": 1.17626095, + "balance_loss_mlp": 1.03732264, + "epoch": 0.4069019419226838, + "flos": 28625009332320.0, + "grad_norm": 2.2885130449403635, + "language_loss": 0.85568631, + "learning_rate": 2.686073202327041e-06, + "loss": 0.88402903, + "num_input_tokens_seen": 73084410, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 2.67773438, + "step": 3384, + "time_per_iteration": 3.1019532680511475 + }, + { + "auxiliary_loss_clip": 0.01530257, + "auxiliary_loss_mlp": 0.01319711, + "balance_loss_clip": 1.17737949, + "balance_loss_mlp": 1.04943931, + "epoch": 0.4070221848133229, + "flos": 25231391262720.0, + "grad_norm": 1.9448497937436227, + "language_loss": 0.73707312, + "learning_rate": 2.6853414450722043e-06, + "loss": 0.76557279, + "num_input_tokens_seen": 73104075, + "router_z_loss_clip": 3.52734375, + "router_z_loss_mlp": 2.70507812, + "step": 3385, + "time_per_iteration": 3.0485143661499023 + }, + { + "auxiliary_loss_clip": 0.01521348, + "auxiliary_loss_mlp": 0.01313028, + "balance_loss_clip": 1.16918063, + "balance_loss_mlp": 1.0425663, + "epoch": 0.40714242770396203, + "flos": 18407174803200.0, + "grad_norm": 2.2805454895619337, + "language_loss": 0.85162485, + "learning_rate": 2.684609583851616e-06, + "loss": 0.87996858, + "num_input_tokens_seen": 73122250, + "router_z_loss_clip": 3.5234375, + "router_z_loss_mlp": 2.70703125, + "step": 3386, + "time_per_iteration": 2.978754997253418 + }, + { + "auxiliary_loss_clip": 0.01527098, + "auxiliary_loss_mlp": 0.0131885, + "balance_loss_clip": 1.17408323, + "balance_loss_mlp": 1.048769, + "epoch": 0.4072626705946011, + "flos": 30231662220000.0, + "grad_norm": 2.7951920460396646, + "language_loss": 0.80495012, + "learning_rate": 2.683877618776297e-06, + "loss": 0.83340967, + "num_input_tokens_seen": 73144505, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 2.703125, + "step": 3387, + "time_per_iteration": 3.0318989753723145 + }, + { + "auxiliary_loss_clip": 0.01528976, + "auxiliary_loss_mlp": 0.01304405, + "balance_loss_clip": 1.17593312, + "balance_loss_mlp": 1.03317988, + "epoch": 0.4073829134852402, + "flos": 21836673276480.0, + "grad_norm": 3.648408930826029, + "language_loss": 0.74201417, + "learning_rate": 2.6831455499572876e-06, + "loss": 0.77034801, + "num_input_tokens_seen": 73162440, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 2.71484375, + "step": 3388, + "time_per_iteration": 2.9353976249694824 + }, + { + "auxiliary_loss_clip": 0.01526768, + "auxiliary_loss_mlp": 0.01309348, + "balance_loss_clip": 1.17428088, + "balance_loss_mlp": 1.03850424, + "epoch": 0.40750315637587925, + "flos": 25262303077440.0, + "grad_norm": 2.1936776062669607, + "language_loss": 0.77799022, + "learning_rate": 2.682413377505641e-06, + "loss": 0.8063513, + "num_input_tokens_seen": 73181245, + "router_z_loss_clip": 3.52539062, + "router_z_loss_mlp": 2.7109375, + "step": 3389, + "time_per_iteration": 3.083224296569824 + }, + { + "auxiliary_loss_clip": 0.01520659, + "auxiliary_loss_mlp": 0.01315711, + "balance_loss_clip": 1.16706514, + "balance_loss_mlp": 1.04315138, + "epoch": 0.40762339926651836, + "flos": 19714457809440.0, + "grad_norm": 3.5581965736742776, + "language_loss": 0.7654171, + "learning_rate": 2.6816811015324284e-06, + "loss": 0.7937808, + "num_input_tokens_seen": 73199295, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 2.72851562, + "step": 3390, + "time_per_iteration": 3.045943021774292 + }, + { + "auxiliary_loss_clip": 0.01570597, + "auxiliary_loss_mlp": 0.01472504, + "balance_loss_clip": 1.22952795, + "balance_loss_mlp": 1.23599243, + "epoch": 0.40774364215715747, + "flos": 71455925144160.0, + "grad_norm": 2.309019590020576, + "language_loss": 0.56606871, + "learning_rate": 2.6809487221487343e-06, + "loss": 0.59649968, + "num_input_tokens_seen": 73258780, + "router_z_loss_clip": 3.40625, + "router_z_loss_mlp": 2.359375, + "step": 3391, + "time_per_iteration": 3.5262601375579834 + }, + { + "auxiliary_loss_clip": 0.01521743, + "auxiliary_loss_mlp": 0.01326782, + "balance_loss_clip": 1.16975904, + "balance_loss_mlp": 1.05193329, + "epoch": 0.4078638850477965, + "flos": 15085848535200.0, + "grad_norm": 4.379216413098749, + "language_loss": 0.81736487, + "learning_rate": 2.6802162394656605e-06, + "loss": 0.84585011, + "num_input_tokens_seen": 73275490, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 2.75195312, + "step": 3392, + "time_per_iteration": 3.820000171661377 + }, + { + "auxiliary_loss_clip": 0.01520305, + "auxiliary_loss_mlp": 0.01320361, + "balance_loss_clip": 1.1683023, + "balance_loss_mlp": 1.04780078, + "epoch": 0.40798412793843564, + "flos": 23845216955040.0, + "grad_norm": 1.946065880849697, + "language_loss": 0.71377409, + "learning_rate": 2.679483653594324e-06, + "loss": 0.74218071, + "num_input_tokens_seen": 73297260, + "router_z_loss_clip": 3.52148438, + "router_z_loss_mlp": 2.72851562, + "step": 3393, + "time_per_iteration": 3.916717529296875 + }, + { + "auxiliary_loss_clip": 0.01520063, + "auxiliary_loss_mlp": 0.01303688, + "balance_loss_clip": 1.1669153, + "balance_loss_mlp": 1.03456163, + "epoch": 0.40810437082907475, + "flos": 21067710109920.0, + "grad_norm": 2.544899481678666, + "language_loss": 0.76611596, + "learning_rate": 2.678750964645857e-06, + "loss": 0.79435349, + "num_input_tokens_seen": 73316340, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 2.69335938, + "step": 3394, + "time_per_iteration": 3.048532009124756 + }, + { + "auxiliary_loss_clip": 0.01522714, + "auxiliary_loss_mlp": 0.01311617, + "balance_loss_clip": 1.17001557, + "balance_loss_mlp": 1.03867507, + "epoch": 0.4082246137197138, + "flos": 11322616903200.0, + "grad_norm": 2.913347668813051, + "language_loss": 0.83593023, + "learning_rate": 2.6780181727314094e-06, + "loss": 0.86427355, + "num_input_tokens_seen": 73331245, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 2.73242188, + "step": 3395, + "time_per_iteration": 3.83762264251709 + }, + { + "auxiliary_loss_clip": 0.01522112, + "auxiliary_loss_mlp": 0.01295141, + "balance_loss_clip": 1.17009652, + "balance_loss_mlp": 1.02906609, + "epoch": 0.4083448566103529, + "flos": 19064590158240.0, + "grad_norm": 2.111371893766213, + "language_loss": 0.77992094, + "learning_rate": 2.6772852779621435e-06, + "loss": 0.80809343, + "num_input_tokens_seen": 73349105, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 2.66210938, + "step": 3396, + "time_per_iteration": 3.0422916412353516 + }, + { + "auxiliary_loss_clip": 0.01532944, + "auxiliary_loss_mlp": 0.01294497, + "balance_loss_clip": 1.18175876, + "balance_loss_mlp": 1.02613342, + "epoch": 0.408465099500992, + "flos": 23552522429760.0, + "grad_norm": 10.472343455052869, + "language_loss": 0.87243426, + "learning_rate": 2.676552280449239e-06, + "loss": 0.90070873, + "num_input_tokens_seen": 73368990, + "router_z_loss_clip": 3.51171875, + "router_z_loss_mlp": 2.68554688, + "step": 3397, + "time_per_iteration": 3.831381320953369 + }, + { + "auxiliary_loss_clip": 0.01532346, + "auxiliary_loss_mlp": 0.01299755, + "balance_loss_clip": 1.17968071, + "balance_loss_mlp": 1.0298655, + "epoch": 0.4085853423916311, + "flos": 12751650396000.0, + "grad_norm": 3.7868873384328983, + "language_loss": 0.75521624, + "learning_rate": 2.6758191803038917e-06, + "loss": 0.78353727, + "num_input_tokens_seen": 73387485, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 2.70117188, + "step": 3398, + "time_per_iteration": 3.0474419593811035 + }, + { + "auxiliary_loss_clip": 0.01531308, + "auxiliary_loss_mlp": 0.01306209, + "balance_loss_clip": 1.1789906, + "balance_loss_mlp": 1.0340302, + "epoch": 0.4087055852822702, + "flos": 24355393735680.0, + "grad_norm": 1.6464630633035944, + "language_loss": 0.82369661, + "learning_rate": 2.6750859776373125e-06, + "loss": 0.85207188, + "num_input_tokens_seen": 73406940, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 2.72460938, + "step": 3399, + "time_per_iteration": 2.991433620452881 + }, + { + "auxiliary_loss_clip": 0.01580542, + "auxiliary_loss_mlp": 0.01291023, + "balance_loss_clip": 1.24087119, + "balance_loss_mlp": 1.06443024, + "epoch": 0.4088258281729093, + "flos": 66394854617760.0, + "grad_norm": 0.7927801106756757, + "language_loss": 0.60323155, + "learning_rate": 2.674352672560727e-06, + "loss": 0.63194716, + "num_input_tokens_seen": 73468385, + "router_z_loss_clip": 3.3984375, + "router_z_loss_mlp": 2.265625, + "step": 3400, + "time_per_iteration": 3.5221221446990967 + }, + { + "auxiliary_loss_clip": 0.01535018, + "auxiliary_loss_mlp": 0.01293928, + "balance_loss_clip": 1.18349719, + "balance_loss_mlp": 1.02899694, + "epoch": 0.40894607106354836, + "flos": 20451371316480.0, + "grad_norm": 1.991139912917422, + "language_loss": 0.77080113, + "learning_rate": 2.673619265185377e-06, + "loss": 0.79909056, + "num_input_tokens_seen": 73488225, + "router_z_loss_clip": 3.51757812, + "router_z_loss_mlp": 2.65039062, + "step": 3401, + "time_per_iteration": 2.9703898429870605 + }, + { + "auxiliary_loss_clip": 0.01534113, + "auxiliary_loss_mlp": 0.01292994, + "balance_loss_clip": 1.18179381, + "balance_loss_mlp": 1.02215004, + "epoch": 0.40906631395418747, + "flos": 27056133256320.0, + "grad_norm": 1.8642005535751793, + "language_loss": 0.77912009, + "learning_rate": 2.672885755622521e-06, + "loss": 0.80739117, + "num_input_tokens_seen": 73510640, + "router_z_loss_clip": 3.52539062, + "router_z_loss_mlp": 2.7109375, + "step": 3402, + "time_per_iteration": 2.9899182319641113 + }, + { + "auxiliary_loss_clip": 0.01526138, + "auxiliary_loss_mlp": 0.01291596, + "balance_loss_clip": 1.17619252, + "balance_loss_mlp": 1.02380419, + "epoch": 0.4091865568448266, + "flos": 25486536545280.0, + "grad_norm": 2.1756799995024063, + "language_loss": 0.70466304, + "learning_rate": 2.67215214398343e-06, + "loss": 0.73284042, + "num_input_tokens_seen": 73530655, + "router_z_loss_clip": 3.49804688, + "router_z_loss_mlp": 2.6796875, + "step": 3403, + "time_per_iteration": 2.9614334106445312 + }, + { + "auxiliary_loss_clip": 0.01527893, + "auxiliary_loss_mlp": 0.01306581, + "balance_loss_clip": 1.17650414, + "balance_loss_mlp": 1.03726315, + "epoch": 0.40930679973546563, + "flos": 28660396669920.0, + "grad_norm": 2.828378739896613, + "language_loss": 0.78757322, + "learning_rate": 2.671418430379393e-06, + "loss": 0.81591791, + "num_input_tokens_seen": 73549340, + "router_z_loss_clip": 3.51171875, + "router_z_loss_mlp": 2.6953125, + "step": 3404, + "time_per_iteration": 3.041090726852417 + }, + { + "auxiliary_loss_clip": 0.01532451, + "auxiliary_loss_mlp": 0.01290356, + "balance_loss_clip": 1.18253875, + "balance_loss_mlp": 1.02065659, + "epoch": 0.40942704262610474, + "flos": 20888383947840.0, + "grad_norm": 2.3813477951268984, + "language_loss": 0.83459473, + "learning_rate": 2.670684614921715e-06, + "loss": 0.86282277, + "num_input_tokens_seen": 73568315, + "router_z_loss_clip": 3.49804688, + "router_z_loss_mlp": 2.69921875, + "step": 3405, + "time_per_iteration": 2.9972164630889893 + }, + { + "auxiliary_loss_clip": 0.01533564, + "auxiliary_loss_mlp": 0.01306751, + "balance_loss_clip": 1.18187726, + "balance_loss_mlp": 1.04124761, + "epoch": 0.4095472855167438, + "flos": 21620328865920.0, + "grad_norm": 3.0067279227670527, + "language_loss": 0.69047129, + "learning_rate": 2.6699506977217128e-06, + "loss": 0.71887445, + "num_input_tokens_seen": 73588490, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 2.65625, + "step": 3406, + "time_per_iteration": 3.0323894023895264 + }, + { + "auxiliary_loss_clip": 0.01539801, + "auxiliary_loss_mlp": 0.01298421, + "balance_loss_clip": 1.18878889, + "balance_loss_mlp": 1.03291857, + "epoch": 0.4096675284073829, + "flos": 27928489680000.0, + "grad_norm": 4.660597020624744, + "language_loss": 0.69789088, + "learning_rate": 2.6692166788907233e-06, + "loss": 0.72627318, + "num_input_tokens_seen": 73608685, + "router_z_loss_clip": 3.50976562, + "router_z_loss_mlp": 2.65625, + "step": 3407, + "time_per_iteration": 3.1349079608917236 + }, + { + "auxiliary_loss_clip": 0.01530938, + "auxiliary_loss_mlp": 0.0130766, + "balance_loss_clip": 1.18011272, + "balance_loss_mlp": 1.03757977, + "epoch": 0.409787771298022, + "flos": 19208528982720.0, + "grad_norm": 2.236738132125722, + "language_loss": 0.76713955, + "learning_rate": 2.6684825585400957e-06, + "loss": 0.79552555, + "num_input_tokens_seen": 73627630, + "router_z_loss_clip": 3.50976562, + "router_z_loss_mlp": 2.703125, + "step": 3408, + "time_per_iteration": 2.999859094619751 + }, + { + "auxiliary_loss_clip": 0.01576415, + "auxiliary_loss_mlp": 0.01204811, + "balance_loss_clip": 1.23899806, + "balance_loss_mlp": 0.98966217, + "epoch": 0.4099080141886611, + "flos": 59275516230720.0, + "grad_norm": 0.8199873821345294, + "language_loss": 0.65112412, + "learning_rate": 2.6677483367811947e-06, + "loss": 0.67893636, + "num_input_tokens_seen": 73687670, + "router_z_loss_clip": 3.375, + "router_z_loss_mlp": 2.15625, + "step": 3409, + "time_per_iteration": 3.542165756225586 + }, + { + "auxiliary_loss_clip": 0.01516863, + "auxiliary_loss_mlp": 0.01299409, + "balance_loss_clip": 1.16470993, + "balance_loss_mlp": 1.03199828, + "epoch": 0.4100282570793002, + "flos": 21908509940160.0, + "grad_norm": 1.9450338449009243, + "language_loss": 0.75722229, + "learning_rate": 2.6670140137254028e-06, + "loss": 0.78538501, + "num_input_tokens_seen": 73707145, + "router_z_loss_clip": 3.52539062, + "router_z_loss_mlp": 2.67578125, + "step": 3410, + "time_per_iteration": 2.993360996246338 + }, + { + "auxiliary_loss_clip": 0.01529127, + "auxiliary_loss_mlp": 0.01293597, + "balance_loss_clip": 1.17756343, + "balance_loss_mlp": 1.02599609, + "epoch": 0.4101484999699393, + "flos": 18553578958080.0, + "grad_norm": 4.822080877213759, + "language_loss": 0.89281726, + "learning_rate": 2.666279589484115e-06, + "loss": 0.92104453, + "num_input_tokens_seen": 73725045, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 2.67773438, + "step": 3411, + "time_per_iteration": 2.990745782852173 + }, + { + "auxiliary_loss_clip": 0.01528872, + "auxiliary_loss_mlp": 0.01290074, + "balance_loss_clip": 1.17872477, + "balance_loss_mlp": 1.02037466, + "epoch": 0.41026874286057835, + "flos": 19096298464320.0, + "grad_norm": 2.2329761908790693, + "language_loss": 0.81277883, + "learning_rate": 2.6655450641687435e-06, + "loss": 0.84096825, + "num_input_tokens_seen": 73742610, + "router_z_loss_clip": 3.5, + "router_z_loss_mlp": 2.69921875, + "step": 3412, + "time_per_iteration": 3.0149128437042236 + }, + { + "auxiliary_loss_clip": 0.01531543, + "auxiliary_loss_mlp": 0.01287193, + "balance_loss_clip": 1.18055177, + "balance_loss_mlp": 1.02092743, + "epoch": 0.41038898575121746, + "flos": 31212114992640.0, + "grad_norm": 3.1015613901277255, + "language_loss": 0.69230753, + "learning_rate": 2.664810437890715e-06, + "loss": 0.72049487, + "num_input_tokens_seen": 73764280, + "router_z_loss_clip": 3.5078125, + "router_z_loss_mlp": 2.6640625, + "step": 3413, + "time_per_iteration": 2.974412441253662 + }, + { + "auxiliary_loss_clip": 0.01535539, + "auxiliary_loss_mlp": 0.01291823, + "balance_loss_clip": 1.18379545, + "balance_loss_mlp": 1.0240314, + "epoch": 0.41050922864185657, + "flos": 14357620576800.0, + "grad_norm": 2.077128079455394, + "language_loss": 0.7974686, + "learning_rate": 2.6640757107614714e-06, + "loss": 0.82574224, + "num_input_tokens_seen": 73782375, + "router_z_loss_clip": 3.515625, + "router_z_loss_mlp": 2.6796875, + "step": 3414, + "time_per_iteration": 2.9315266609191895 + }, + { + "auxiliary_loss_clip": 0.01524942, + "auxiliary_loss_mlp": 0.01290534, + "balance_loss_clip": 1.1725508, + "balance_loss_mlp": 1.02083445, + "epoch": 0.4106294715324956, + "flos": 30958941974400.0, + "grad_norm": 2.908684981907495, + "language_loss": 0.68739951, + "learning_rate": 2.6633408828924697e-06, + "loss": 0.71555424, + "num_input_tokens_seen": 73801240, + "router_z_loss_clip": 3.52734375, + "router_z_loss_mlp": 2.69921875, + "step": 3415, + "time_per_iteration": 3.0000572204589844 + }, + { + "auxiliary_loss_clip": 0.01529875, + "auxiliary_loss_mlp": 0.01297457, + "balance_loss_clip": 1.17875493, + "balance_loss_mlp": 1.02833033, + "epoch": 0.41074971442313474, + "flos": 24459355915200.0, + "grad_norm": 1.646808077053443, + "language_loss": 0.6995948, + "learning_rate": 2.662605954395185e-06, + "loss": 0.72786808, + "num_input_tokens_seen": 73821200, + "router_z_loss_clip": 3.50976562, + "router_z_loss_mlp": 2.69335938, + "step": 3416, + "time_per_iteration": 2.9881339073181152 + }, + { + "auxiliary_loss_clip": 0.01518013, + "auxiliary_loss_mlp": 0.01291621, + "balance_loss_clip": 1.16434836, + "balance_loss_mlp": 1.02516484, + "epoch": 0.41086995731377385, + "flos": 21545116596000.0, + "grad_norm": 1.8062709488540531, + "language_loss": 0.83762008, + "learning_rate": 2.6618709253811027e-06, + "loss": 0.86571646, + "num_input_tokens_seen": 73840655, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 2.66601562, + "step": 3417, + "time_per_iteration": 3.009624719619751 + }, + { + "auxiliary_loss_clip": 0.01522497, + "auxiliary_loss_mlp": 0.0130455, + "balance_loss_clip": 1.17086267, + "balance_loss_mlp": 1.03790331, + "epoch": 0.4109902002044129, + "flos": 20704771903680.0, + "grad_norm": 1.7863463540233364, + "language_loss": 0.87659264, + "learning_rate": 2.6611357959617277e-06, + "loss": 0.90486306, + "num_input_tokens_seen": 73860275, + "router_z_loss_clip": 3.52148438, + "router_z_loss_mlp": 2.66796875, + "step": 3418, + "time_per_iteration": 3.0775375366210938 + }, + { + "auxiliary_loss_clip": 0.01517786, + "auxiliary_loss_mlp": 0.01299243, + "balance_loss_clip": 1.16449332, + "balance_loss_mlp": 1.02859044, + "epoch": 0.411110443095052, + "flos": 18181575921600.0, + "grad_norm": 1.9617915069021972, + "language_loss": 0.91177535, + "learning_rate": 2.660400566248578e-06, + "loss": 0.93994564, + "num_input_tokens_seen": 73878400, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 2.70898438, + "step": 3419, + "time_per_iteration": 3.8465960025787354 + }, + { + "auxiliary_loss_clip": 0.01518326, + "auxiliary_loss_mlp": 0.01307125, + "balance_loss_clip": 1.16525841, + "balance_loss_mlp": 1.03551817, + "epoch": 0.41123068598569107, + "flos": 14577037168320.0, + "grad_norm": 4.992086864980212, + "language_loss": 0.6686523, + "learning_rate": 2.6596652363531876e-06, + "loss": 0.69690686, + "num_input_tokens_seen": 73894275, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 2.71875, + "step": 3420, + "time_per_iteration": 3.8366544246673584 + }, + { + "auxiliary_loss_clip": 0.01525632, + "auxiliary_loss_mlp": 0.01303698, + "balance_loss_clip": 1.17198348, + "balance_loss_mlp": 1.03476191, + "epoch": 0.4113509288763302, + "flos": 21180092340960.0, + "grad_norm": 1.6129752671788007, + "language_loss": 0.78301179, + "learning_rate": 2.6589298063871055e-06, + "loss": 0.81130511, + "num_input_tokens_seen": 73914450, + "router_z_loss_clip": 3.54101562, + "router_z_loss_mlp": 2.69140625, + "step": 3421, + "time_per_iteration": 3.0105769634246826 + }, + { + "auxiliary_loss_clip": 0.01519033, + "auxiliary_loss_mlp": 0.01311622, + "balance_loss_clip": 1.1660409, + "balance_loss_mlp": 1.0413506, + "epoch": 0.4114711717669693, + "flos": 18444193051680.0, + "grad_norm": 3.5460068228775556, + "language_loss": 0.69867212, + "learning_rate": 2.658194276461895e-06, + "loss": 0.72697866, + "num_input_tokens_seen": 73932375, + "router_z_loss_clip": 3.52734375, + "router_z_loss_mlp": 2.70507812, + "step": 3422, + "time_per_iteration": 3.7640445232391357 + }, + { + "auxiliary_loss_clip": 0.01513051, + "auxiliary_loss_mlp": 0.01304621, + "balance_loss_clip": 1.15939879, + "balance_loss_mlp": 1.03072596, + "epoch": 0.41159141465760835, + "flos": 27235762843680.0, + "grad_norm": 2.335835572395433, + "language_loss": 0.67125976, + "learning_rate": 2.6574586466891368e-06, + "loss": 0.69943655, + "num_input_tokens_seen": 73952850, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 2.7421875, + "step": 3423, + "time_per_iteration": 3.0649120807647705 + }, + { + "auxiliary_loss_clip": 0.01513529, + "auxiliary_loss_mlp": 0.01294808, + "balance_loss_clip": 1.15920424, + "balance_loss_mlp": 1.02663505, + "epoch": 0.41171165754824746, + "flos": 20008707389280.0, + "grad_norm": 2.1834886754564047, + "language_loss": 0.64992219, + "learning_rate": 2.6567229171804247e-06, + "loss": 0.67800558, + "num_input_tokens_seen": 73970735, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.68359375, + "step": 3424, + "time_per_iteration": 3.052429676055908 + }, + { + "auxiliary_loss_clip": 0.01512894, + "auxiliary_loss_mlp": 0.01308926, + "balance_loss_clip": 1.16178679, + "balance_loss_mlp": 1.03750992, + "epoch": 0.41183190043888657, + "flos": 18006270144480.0, + "grad_norm": 4.20586585356967, + "language_loss": 0.88097858, + "learning_rate": 2.655987088047368e-06, + "loss": 0.90919679, + "num_input_tokens_seen": 73989080, + "router_z_loss_clip": 3.51171875, + "router_z_loss_mlp": 2.71679688, + "step": 3425, + "time_per_iteration": 3.8361473083496094 + }, + { + "auxiliary_loss_clip": 0.01519507, + "auxiliary_loss_mlp": 0.01305756, + "balance_loss_clip": 1.16769338, + "balance_loss_mlp": 1.03357697, + "epoch": 0.4119521433295256, + "flos": 27165898444320.0, + "grad_norm": 2.0254538447641637, + "language_loss": 0.79038286, + "learning_rate": 2.6552511594015912e-06, + "loss": 0.81863546, + "num_input_tokens_seen": 74009470, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 2.72460938, + "step": 3426, + "time_per_iteration": 3.0016300678253174 + }, + { + "auxiliary_loss_clip": 0.01515792, + "auxiliary_loss_mlp": 0.01310448, + "balance_loss_clip": 1.16398478, + "balance_loss_mlp": 1.03693426, + "epoch": 0.41207238622016473, + "flos": 15123814987680.0, + "grad_norm": 2.761412680043967, + "language_loss": 0.85294694, + "learning_rate": 2.654515131354735e-06, + "loss": 0.88120931, + "num_input_tokens_seen": 74027735, + "router_z_loss_clip": 3.52148438, + "router_z_loss_mlp": 2.73828125, + "step": 3427, + "time_per_iteration": 3.05366587638855 + }, + { + "auxiliary_loss_clip": 0.01525978, + "auxiliary_loss_mlp": 0.01293759, + "balance_loss_clip": 1.1739912, + "balance_loss_mlp": 1.02463186, + "epoch": 0.41219262911080384, + "flos": 27055336764960.0, + "grad_norm": 2.0395653290716362, + "language_loss": 0.84926295, + "learning_rate": 2.653779004018453e-06, + "loss": 0.87746036, + "num_input_tokens_seen": 74048300, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 2.69335938, + "step": 3428, + "time_per_iteration": 2.9899027347564697 + }, + { + "auxiliary_loss_clip": 0.01529508, + "auxiliary_loss_mlp": 0.01295517, + "balance_loss_clip": 1.17669976, + "balance_loss_mlp": 1.02581787, + "epoch": 0.4123128720014429, + "flos": 24688975181760.0, + "grad_norm": 2.56281783081401, + "language_loss": 0.82720453, + "learning_rate": 2.653042777504417e-06, + "loss": 0.8554548, + "num_input_tokens_seen": 74070890, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 2.69921875, + "step": 3429, + "time_per_iteration": 3.1230080127716064 + }, + { + "auxiliary_loss_clip": 0.01520288, + "auxiliary_loss_mlp": 0.01294436, + "balance_loss_clip": 1.16739988, + "balance_loss_mlp": 1.02244806, + "epoch": 0.412433114892082, + "flos": 26246890019520.0, + "grad_norm": 1.867661811574604, + "language_loss": 0.79809135, + "learning_rate": 2.6523064519243105e-06, + "loss": 0.82623857, + "num_input_tokens_seen": 74090460, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 2.72265625, + "step": 3430, + "time_per_iteration": 3.122990131378174 + }, + { + "auxiliary_loss_clip": 0.015295, + "auxiliary_loss_mlp": 0.01289493, + "balance_loss_clip": 1.17692494, + "balance_loss_mlp": 1.02074742, + "epoch": 0.4125533577827211, + "flos": 21363173390880.0, + "grad_norm": 2.6792955884066623, + "language_loss": 0.7888698, + "learning_rate": 2.6515700273898333e-06, + "loss": 0.8170597, + "num_input_tokens_seen": 74108335, + "router_z_loss_clip": 3.52734375, + "router_z_loss_mlp": 2.68945312, + "step": 3431, + "time_per_iteration": 3.1747841835021973 + }, + { + "auxiliary_loss_clip": 0.01525726, + "auxiliary_loss_mlp": 0.01300624, + "balance_loss_clip": 1.17323554, + "balance_loss_mlp": 1.0332135, + "epoch": 0.4126736006733602, + "flos": 26069953331520.0, + "grad_norm": 2.144630785358514, + "language_loss": 0.68907475, + "learning_rate": 2.6508335040127018e-06, + "loss": 0.71733826, + "num_input_tokens_seen": 74128030, + "router_z_loss_clip": 3.5234375, + "router_z_loss_mlp": 2.67578125, + "step": 3432, + "time_per_iteration": 3.1267199516296387 + }, + { + "auxiliary_loss_clip": 0.01530517, + "auxiliary_loss_mlp": 0.0129393, + "balance_loss_clip": 1.17814803, + "balance_loss_mlp": 1.02575707, + "epoch": 0.4127938435639993, + "flos": 25668517678560.0, + "grad_norm": 1.5980078369533617, + "language_loss": 0.77037269, + "learning_rate": 2.6500968819046446e-06, + "loss": 0.79861712, + "num_input_tokens_seen": 74148330, + "router_z_loss_clip": 3.52539062, + "router_z_loss_mlp": 2.68359375, + "step": 3433, + "time_per_iteration": 3.004031181335449 + }, + { + "auxiliary_loss_clip": 0.0152771, + "auxiliary_loss_mlp": 0.01290179, + "balance_loss_clip": 1.17572689, + "balance_loss_mlp": 1.02276874, + "epoch": 0.4129140864546384, + "flos": 17997394955040.0, + "grad_norm": 3.2777160108696823, + "language_loss": 0.5906024, + "learning_rate": 2.649360161177408e-06, + "loss": 0.61878133, + "num_input_tokens_seen": 74163390, + "router_z_loss_clip": 3.52148438, + "router_z_loss_mlp": 2.67578125, + "step": 3434, + "time_per_iteration": 2.941260814666748 + }, + { + "auxiliary_loss_clip": 0.01529136, + "auxiliary_loss_mlp": 0.01283523, + "balance_loss_clip": 1.17716479, + "balance_loss_mlp": 1.01534927, + "epoch": 0.41303432934527745, + "flos": 23735186269920.0, + "grad_norm": 1.8032049898172156, + "language_loss": 0.73540986, + "learning_rate": 2.6486233419427504e-06, + "loss": 0.76353645, + "num_input_tokens_seen": 74183205, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 2.68359375, + "step": 3435, + "time_per_iteration": 3.0176422595977783 + }, + { + "auxiliary_loss_clip": 0.01528118, + "auxiliary_loss_mlp": 0.01292251, + "balance_loss_clip": 1.17553055, + "balance_loss_mlp": 1.02026296, + "epoch": 0.41315457223591656, + "flos": 19757316994560.0, + "grad_norm": 3.3935974023548163, + "language_loss": 0.75335014, + "learning_rate": 2.6478864243124484e-06, + "loss": 0.78155386, + "num_input_tokens_seen": 74202870, + "router_z_loss_clip": 3.52539062, + "router_z_loss_mlp": 2.72265625, + "step": 3436, + "time_per_iteration": 2.9713244438171387 + }, + { + "auxiliary_loss_clip": 0.01524071, + "auxiliary_loss_mlp": 0.01288169, + "balance_loss_clip": 1.17009354, + "balance_loss_mlp": 1.01846957, + "epoch": 0.4132748151265556, + "flos": 20925440124480.0, + "grad_norm": 3.038760099037988, + "language_loss": 0.85438716, + "learning_rate": 2.6471494083982903e-06, + "loss": 0.88250959, + "num_input_tokens_seen": 74222255, + "router_z_loss_clip": 3.54101562, + "router_z_loss_mlp": 2.69921875, + "step": 3437, + "time_per_iteration": 3.0429954528808594 + }, + { + "auxiliary_loss_clip": 0.01524909, + "auxiliary_loss_mlp": 0.01297378, + "balance_loss_clip": 1.17222941, + "balance_loss_mlp": 1.02748799, + "epoch": 0.4133950580171947, + "flos": 32235199381440.0, + "grad_norm": 1.9254858923904061, + "language_loss": 0.74692947, + "learning_rate": 2.6464122943120818e-06, + "loss": 0.77515233, + "num_input_tokens_seen": 74242480, + "router_z_loss_clip": 3.52734375, + "router_z_loss_mlp": 2.70117188, + "step": 3438, + "time_per_iteration": 3.0102486610412598 + }, + { + "auxiliary_loss_clip": 0.0152705, + "auxiliary_loss_mlp": 0.01297881, + "balance_loss_clip": 1.17470145, + "balance_loss_mlp": 1.03027987, + "epoch": 0.41351530090783384, + "flos": 23294608391520.0, + "grad_norm": 3.5348937017766935, + "language_loss": 0.82402545, + "learning_rate": 2.645675082165642e-06, + "loss": 0.85227472, + "num_input_tokens_seen": 74258690, + "router_z_loss_clip": 3.52148438, + "router_z_loss_mlp": 2.67773438, + "step": 3439, + "time_per_iteration": 3.0413076877593994 + }, + { + "auxiliary_loss_clip": 0.0153014, + "auxiliary_loss_mlp": 0.01296931, + "balance_loss_clip": 1.17899537, + "balance_loss_mlp": 1.02570581, + "epoch": 0.4136355437984729, + "flos": 25595960379840.0, + "grad_norm": 2.116470742226396, + "language_loss": 0.75918299, + "learning_rate": 2.644937772070806e-06, + "loss": 0.78745371, + "num_input_tokens_seen": 74277135, + "router_z_loss_clip": 3.51757812, + "router_z_loss_mlp": 2.71484375, + "step": 3440, + "time_per_iteration": 2.9645166397094727 + }, + { + "auxiliary_loss_clip": 0.01536916, + "auxiliary_loss_mlp": 0.01293009, + "balance_loss_clip": 1.18471169, + "balance_loss_mlp": 1.02712476, + "epoch": 0.413755786689112, + "flos": 19830670784640.0, + "grad_norm": 4.047219211693978, + "language_loss": 0.83518219, + "learning_rate": 2.6442003641394225e-06, + "loss": 0.86348152, + "num_input_tokens_seen": 74294730, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 2.66015625, + "step": 3441, + "time_per_iteration": 2.9906980991363525 + }, + { + "auxiliary_loss_clip": 0.01533425, + "auxiliary_loss_mlp": 0.0129988, + "balance_loss_clip": 1.18224478, + "balance_loss_mlp": 1.0334239, + "epoch": 0.4138760295797511, + "flos": 26872900493760.0, + "grad_norm": 1.4932567132722487, + "language_loss": 0.84018183, + "learning_rate": 2.643462858483356e-06, + "loss": 0.86851478, + "num_input_tokens_seen": 74315015, + "router_z_loss_clip": 3.51367188, + "router_z_loss_mlp": 2.66601562, + "step": 3442, + "time_per_iteration": 3.11112117767334 + }, + { + "auxiliary_loss_clip": 0.01535868, + "auxiliary_loss_mlp": 0.01301794, + "balance_loss_clip": 1.18433809, + "balance_loss_mlp": 1.03209496, + "epoch": 0.41399627247039017, + "flos": 16401134383200.0, + "grad_norm": 3.0267466987676146, + "language_loss": 0.73330683, + "learning_rate": 2.6427252552144856e-06, + "loss": 0.76168346, + "num_input_tokens_seen": 74333665, + "router_z_loss_clip": 3.515625, + "router_z_loss_mlp": 2.69921875, + "step": 3443, + "time_per_iteration": 3.121304512023926 + }, + { + "auxiliary_loss_clip": 0.01538766, + "auxiliary_loss_mlp": 0.01307298, + "balance_loss_clip": 1.18699253, + "balance_loss_mlp": 1.03817141, + "epoch": 0.4141165153610293, + "flos": 22932466676640.0, + "grad_norm": 3.0494903959140336, + "language_loss": 0.75256443, + "learning_rate": 2.6419875544447044e-06, + "loss": 0.78102505, + "num_input_tokens_seen": 74355065, + "router_z_loss_clip": 3.515625, + "router_z_loss_mlp": 2.69335938, + "step": 3444, + "time_per_iteration": 3.093135356903076 + }, + { + "auxiliary_loss_clip": 0.01530949, + "auxiliary_loss_mlp": 0.0129835, + "balance_loss_clip": 1.17916298, + "balance_loss_mlp": 1.02903223, + "epoch": 0.4142367582516684, + "flos": 25194941936640.0, + "grad_norm": 1.7315548233127047, + "language_loss": 0.71611995, + "learning_rate": 2.6412497562859218e-06, + "loss": 0.7444129, + "num_input_tokens_seen": 74376345, + "router_z_loss_clip": 3.52148438, + "router_z_loss_mlp": 2.6953125, + "step": 3445, + "time_per_iteration": 3.1077992916107178 + }, + { + "auxiliary_loss_clip": 0.01539282, + "auxiliary_loss_mlp": 0.01290628, + "balance_loss_clip": 1.18834901, + "balance_loss_mlp": 1.02188301, + "epoch": 0.41435700114230745, + "flos": 21692696523840.0, + "grad_norm": 2.1902615400936947, + "language_loss": 0.76563263, + "learning_rate": 2.6405118608500617e-06, + "loss": 0.79393172, + "num_input_tokens_seen": 74395170, + "router_z_loss_clip": 3.50976562, + "router_z_loss_mlp": 2.68945312, + "step": 3446, + "time_per_iteration": 3.1177449226379395 + }, + { + "auxiliary_loss_clip": 0.01545518, + "auxiliary_loss_mlp": 0.01276146, + "balance_loss_clip": 1.19382954, + "balance_loss_mlp": 1.00988042, + "epoch": 0.41447724403294656, + "flos": 25997585673600.0, + "grad_norm": 2.244926671562138, + "language_loss": 0.81475902, + "learning_rate": 2.6397738682490613e-06, + "loss": 0.84297568, + "num_input_tokens_seen": 74416070, + "router_z_loss_clip": 3.51757812, + "router_z_loss_mlp": 2.6640625, + "step": 3447, + "time_per_iteration": 3.9620516300201416 + }, + { + "auxiliary_loss_clip": 0.01528104, + "auxiliary_loss_mlp": 0.01289725, + "balance_loss_clip": 1.17483258, + "balance_loss_mlp": 1.02193296, + "epoch": 0.41459748692358567, + "flos": 18261642996000.0, + "grad_norm": 2.8045127011680897, + "language_loss": 0.75283766, + "learning_rate": 2.6390357785948734e-06, + "loss": 0.78101599, + "num_input_tokens_seen": 74433185, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 2.6796875, + "step": 3448, + "time_per_iteration": 3.9275941848754883 + }, + { + "auxiliary_loss_clip": 0.01537759, + "auxiliary_loss_mlp": 0.01297321, + "balance_loss_clip": 1.18476319, + "balance_loss_mlp": 1.02857518, + "epoch": 0.4147177298142247, + "flos": 24170643846720.0, + "grad_norm": 2.0366675409682524, + "language_loss": 0.80303782, + "learning_rate": 2.6382975919994667e-06, + "loss": 0.83138859, + "num_input_tokens_seen": 74453760, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 2.68945312, + "step": 3449, + "time_per_iteration": 3.8926172256469727 + }, + { + "auxiliary_loss_clip": 0.01544721, + "auxiliary_loss_mlp": 0.01289775, + "balance_loss_clip": 1.19327044, + "balance_loss_mlp": 1.0246532, + "epoch": 0.41483797270486383, + "flos": 20086764271200.0, + "grad_norm": 1.9531568766104543, + "language_loss": 0.72899878, + "learning_rate": 2.637559308574822e-06, + "loss": 0.75734365, + "num_input_tokens_seen": 74473505, + "router_z_loss_clip": 3.51367188, + "router_z_loss_mlp": 2.65234375, + "step": 3450, + "time_per_iteration": 3.035456657409668 + }, + { + "auxiliary_loss_clip": 0.01535145, + "auxiliary_loss_mlp": 0.01294488, + "balance_loss_clip": 1.1836772, + "balance_loss_mlp": 1.02726865, + "epoch": 0.4149582155955029, + "flos": 30083665082400.0, + "grad_norm": 2.2037951507837725, + "language_loss": 0.7132436, + "learning_rate": 2.6368209284329376e-06, + "loss": 0.7415399, + "num_input_tokens_seen": 74494135, + "router_z_loss_clip": 3.50976562, + "router_z_loss_mlp": 2.67382812, + "step": 3451, + "time_per_iteration": 3.0808749198913574 + }, + { + "auxiliary_loss_clip": 0.01541795, + "auxiliary_loss_mlp": 0.01298202, + "balance_loss_clip": 1.18965304, + "balance_loss_mlp": 1.02888453, + "epoch": 0.415078458486142, + "flos": 16766310350880.0, + "grad_norm": 2.1517379392582554, + "language_loss": 0.7551232, + "learning_rate": 2.636082451685825e-06, + "loss": 0.7835232, + "num_input_tokens_seen": 74512335, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 2.6953125, + "step": 3452, + "time_per_iteration": 3.885922431945801 + }, + { + "auxiliary_loss_clip": 0.01538804, + "auxiliary_loss_mlp": 0.01299286, + "balance_loss_clip": 1.18700957, + "balance_loss_mlp": 1.03187609, + "epoch": 0.4151987013767811, + "flos": 26036234832960.0, + "grad_norm": 2.7613112448531694, + "language_loss": 0.86478567, + "learning_rate": 2.6353438784455094e-06, + "loss": 0.89316654, + "num_input_tokens_seen": 74535620, + "router_z_loss_clip": 3.51757812, + "router_z_loss_mlp": 2.67578125, + "step": 3453, + "time_per_iteration": 3.0463650226593018 + }, + { + "auxiliary_loss_clip": 0.01547354, + "auxiliary_loss_mlp": 0.01302892, + "balance_loss_clip": 1.19583869, + "balance_loss_mlp": 1.03395545, + "epoch": 0.41531894426742016, + "flos": 24610311449280.0, + "grad_norm": 3.5142866054586253, + "language_loss": 0.7201162, + "learning_rate": 2.6346052088240326e-06, + "loss": 0.74861866, + "num_input_tokens_seen": 74555140, + "router_z_loss_clip": 3.51367188, + "router_z_loss_mlp": 2.69140625, + "step": 3454, + "time_per_iteration": 2.987651824951172 + }, + { + "auxiliary_loss_clip": 0.0153202, + "auxiliary_loss_mlp": 0.01297664, + "balance_loss_clip": 1.17954206, + "balance_loss_mlp": 1.03120768, + "epoch": 0.4154391871580593, + "flos": 14977562545440.0, + "grad_norm": 3.736001804068816, + "language_loss": 0.77693713, + "learning_rate": 2.63386644293345e-06, + "loss": 0.80523396, + "num_input_tokens_seen": 74571485, + "router_z_loss_clip": 3.5234375, + "router_z_loss_mlp": 2.66601562, + "step": 3455, + "time_per_iteration": 2.9398763179779053 + }, + { + "auxiliary_loss_clip": 0.01537226, + "auxiliary_loss_mlp": 0.01288309, + "balance_loss_clip": 1.18506944, + "balance_loss_mlp": 1.02299666, + "epoch": 0.4155594300486984, + "flos": 14648911760160.0, + "grad_norm": 4.076950787298842, + "language_loss": 0.82856011, + "learning_rate": 2.633127580885833e-06, + "loss": 0.85681546, + "num_input_tokens_seen": 74585985, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 2.65429688, + "step": 3456, + "time_per_iteration": 3.037715196609497 + }, + { + "auxiliary_loss_clip": 0.01540336, + "auxiliary_loss_mlp": 0.01287692, + "balance_loss_clip": 1.1891222, + "balance_loss_mlp": 1.02180707, + "epoch": 0.41567967293933744, + "flos": 29499869014560.0, + "grad_norm": 5.196031776714694, + "language_loss": 0.64669073, + "learning_rate": 2.632388622793265e-06, + "loss": 0.67497098, + "num_input_tokens_seen": 74605140, + "router_z_loss_clip": 3.515625, + "router_z_loss_mlp": 2.66015625, + "step": 3457, + "time_per_iteration": 3.0233891010284424 + }, + { + "auxiliary_loss_clip": 0.01543857, + "auxiliary_loss_mlp": 0.01297822, + "balance_loss_clip": 1.1926527, + "balance_loss_mlp": 1.02964854, + "epoch": 0.41579991582997655, + "flos": 19240047648000.0, + "grad_norm": 2.135555490845609, + "language_loss": 0.67564315, + "learning_rate": 2.6316495687678457e-06, + "loss": 0.70405996, + "num_input_tokens_seen": 74623790, + "router_z_loss_clip": 3.51171875, + "router_z_loss_mlp": 2.68359375, + "step": 3458, + "time_per_iteration": 2.9589900970458984 + }, + { + "auxiliary_loss_clip": 0.01534937, + "auxiliary_loss_mlp": 0.01290227, + "balance_loss_clip": 1.18317628, + "balance_loss_mlp": 1.02338862, + "epoch": 0.41592015872061566, + "flos": 24464362432320.0, + "grad_norm": 3.5611413083226964, + "language_loss": 0.76935142, + "learning_rate": 2.6309104189216887e-06, + "loss": 0.79760301, + "num_input_tokens_seen": 74641355, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 2.66992188, + "step": 3459, + "time_per_iteration": 3.0006091594696045 + }, + { + "auxiliary_loss_clip": 0.01538175, + "auxiliary_loss_mlp": 0.01294211, + "balance_loss_clip": 1.18745434, + "balance_loss_mlp": 1.02451181, + "epoch": 0.4160404016112547, + "flos": 20777670555840.0, + "grad_norm": 2.6829876465940625, + "language_loss": 0.75372225, + "learning_rate": 2.630171173366923e-06, + "loss": 0.78204608, + "num_input_tokens_seen": 74657155, + "router_z_loss_clip": 3.50585938, + "router_z_loss_mlp": 2.69921875, + "step": 3460, + "time_per_iteration": 2.912564277648926 + }, + { + "auxiliary_loss_clip": 0.01547938, + "auxiliary_loss_mlp": 0.01298829, + "balance_loss_clip": 1.19673562, + "balance_loss_mlp": 1.02989316, + "epoch": 0.41616064450189383, + "flos": 13918104686880.0, + "grad_norm": 2.583079384155451, + "language_loss": 0.74813068, + "learning_rate": 2.629431832215691e-06, + "loss": 0.77659833, + "num_input_tokens_seen": 74671960, + "router_z_loss_clip": 3.51367188, + "router_z_loss_mlp": 2.69140625, + "step": 3461, + "time_per_iteration": 3.0265908241271973 + }, + { + "auxiliary_loss_clip": 0.01544626, + "auxiliary_loss_mlp": 0.01289591, + "balance_loss_clip": 1.19507432, + "balance_loss_mlp": 1.0235163, + "epoch": 0.41628088739253294, + "flos": 20012879486880.0, + "grad_norm": 2.7095457279060837, + "language_loss": 0.87075567, + "learning_rate": 2.628692395580151e-06, + "loss": 0.8990978, + "num_input_tokens_seen": 74692050, + "router_z_loss_clip": 3.49414062, + "router_z_loss_mlp": 2.66210938, + "step": 3462, + "time_per_iteration": 3.0610313415527344 + }, + { + "auxiliary_loss_clip": 0.01543888, + "auxiliary_loss_mlp": 0.01299354, + "balance_loss_clip": 1.19226646, + "balance_loss_mlp": 1.03079915, + "epoch": 0.416401130283172, + "flos": 29171369941920.0, + "grad_norm": 2.0736669702644703, + "language_loss": 0.79483199, + "learning_rate": 2.6279528635724747e-06, + "loss": 0.82326448, + "num_input_tokens_seen": 74712205, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 2.6875, + "step": 3463, + "time_per_iteration": 3.0473060607910156 + }, + { + "auxiliary_loss_clip": 0.0154149, + "auxiliary_loss_mlp": 0.0130165, + "balance_loss_clip": 1.18971288, + "balance_loss_mlp": 1.03061604, + "epoch": 0.4165213731738111, + "flos": 16248320369280.0, + "grad_norm": 3.636296073719079, + "language_loss": 0.78492194, + "learning_rate": 2.627213236304848e-06, + "loss": 0.8133533, + "num_input_tokens_seen": 74729005, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 2.71289062, + "step": 3464, + "time_per_iteration": 3.074192523956299 + }, + { + "auxiliary_loss_clip": 0.01543927, + "auxiliary_loss_mlp": 0.01303021, + "balance_loss_clip": 1.19300461, + "balance_loss_mlp": 1.03599226, + "epoch": 0.4166416160644502, + "flos": 33768232981920.0, + "grad_norm": 2.3399334820259847, + "language_loss": 0.70602304, + "learning_rate": 2.626473513889472e-06, + "loss": 0.73449254, + "num_input_tokens_seen": 74751385, + "router_z_loss_clip": 3.50976562, + "router_z_loss_mlp": 2.671875, + "step": 3465, + "time_per_iteration": 3.220235586166382 + }, + { + "auxiliary_loss_clip": 0.0154357, + "auxiliary_loss_mlp": 0.01290667, + "balance_loss_clip": 1.19198167, + "balance_loss_mlp": 1.02363825, + "epoch": 0.41676185895508927, + "flos": 20919295762560.0, + "grad_norm": 2.3692798345916306, + "language_loss": 0.82745981, + "learning_rate": 2.625733696438562e-06, + "loss": 0.85580224, + "num_input_tokens_seen": 74768890, + "router_z_loss_clip": 3.515625, + "router_z_loss_mlp": 2.671875, + "step": 3466, + "time_per_iteration": 3.014284610748291 + }, + { + "auxiliary_loss_clip": 0.01551368, + "auxiliary_loss_mlp": 0.01298034, + "balance_loss_clip": 1.20047116, + "balance_loss_mlp": 1.03005123, + "epoch": 0.4168821018457284, + "flos": 18408123007200.0, + "grad_norm": 5.0685068711110155, + "language_loss": 0.75356215, + "learning_rate": 2.6249937840643476e-06, + "loss": 0.78205621, + "num_input_tokens_seen": 74787195, + "router_z_loss_clip": 3.50976562, + "router_z_loss_mlp": 2.68164062, + "step": 3467, + "time_per_iteration": 2.9819867610931396 + }, + { + "auxiliary_loss_clip": 0.01556273, + "auxiliary_loss_mlp": 0.01296843, + "balance_loss_clip": 1.20565796, + "balance_loss_mlp": 1.02790725, + "epoch": 0.41700234473636744, + "flos": 18700438250880.0, + "grad_norm": 1.7914348759827958, + "language_loss": 0.66659129, + "learning_rate": 2.6242537768790733e-06, + "loss": 0.69512242, + "num_input_tokens_seen": 74806350, + "router_z_loss_clip": 3.50585938, + "router_z_loss_mlp": 2.69140625, + "step": 3468, + "time_per_iteration": 3.0051937103271484 + }, + { + "auxiliary_loss_clip": 0.01548956, + "auxiliary_loss_mlp": 0.0130053, + "balance_loss_clip": 1.19941163, + "balance_loss_mlp": 1.02968669, + "epoch": 0.41712258762700655, + "flos": 31036164436800.0, + "grad_norm": 2.6132332282721826, + "language_loss": 0.68512231, + "learning_rate": 2.6235136749949975e-06, + "loss": 0.71361721, + "num_input_tokens_seen": 74829800, + "router_z_loss_clip": 3.4921875, + "router_z_loss_mlp": 2.7109375, + "step": 3469, + "time_per_iteration": 3.124014377593994 + }, + { + "auxiliary_loss_clip": 0.01550794, + "auxiliary_loss_mlp": 0.0128669, + "balance_loss_clip": 1.19937706, + "balance_loss_mlp": 1.02061546, + "epoch": 0.41724283051764566, + "flos": 35917491591360.0, + "grad_norm": 2.2904417032980566, + "language_loss": 0.6112051, + "learning_rate": 2.6227734785243924e-06, + "loss": 0.63957989, + "num_input_tokens_seen": 74849760, + "router_z_loss_clip": 3.51757812, + "router_z_loss_mlp": 2.66210938, + "step": 3470, + "time_per_iteration": 3.062244176864624 + }, + { + "auxiliary_loss_clip": 0.01545165, + "auxiliary_loss_mlp": 0.01290382, + "balance_loss_clip": 1.19441819, + "balance_loss_mlp": 1.02278101, + "epoch": 0.4173630734082847, + "flos": 25335770652000.0, + "grad_norm": 2.385406458399396, + "language_loss": 0.79052699, + "learning_rate": 2.6220331875795466e-06, + "loss": 0.81888247, + "num_input_tokens_seen": 74869110, + "router_z_loss_clip": 3.50585938, + "router_z_loss_mlp": 2.67773438, + "step": 3471, + "time_per_iteration": 3.0529255867004395 + }, + { + "auxiliary_loss_clip": 0.015511, + "auxiliary_loss_mlp": 0.01301276, + "balance_loss_clip": 1.19994617, + "balance_loss_mlp": 1.03157663, + "epoch": 0.4174833162989238, + "flos": 26687695466880.0, + "grad_norm": 1.691554513108874, + "language_loss": 0.74979579, + "learning_rate": 2.62129280227276e-06, + "loss": 0.77831948, + "num_input_tokens_seen": 74889110, + "router_z_loss_clip": 3.51171875, + "router_z_loss_mlp": 2.69921875, + "step": 3472, + "time_per_iteration": 3.0536043643951416 + }, + { + "auxiliary_loss_clip": 0.01555499, + "auxiliary_loss_mlp": 0.0129983, + "balance_loss_clip": 1.20496535, + "balance_loss_mlp": 1.03203833, + "epoch": 0.41760355918956293, + "flos": 74744709050880.0, + "grad_norm": 3.1942420884247116, + "language_loss": 0.67969686, + "learning_rate": 2.62055232271635e-06, + "loss": 0.70825016, + "num_input_tokens_seen": 74916260, + "router_z_loss_clip": 3.50585938, + "router_z_loss_mlp": 2.6796875, + "step": 3473, + "time_per_iteration": 3.2924644947052 + }, + { + "auxiliary_loss_clip": 0.01550666, + "auxiliary_loss_mlp": 0.01277653, + "balance_loss_clip": 1.20032513, + "balance_loss_mlp": 1.01310396, + "epoch": 0.417723802080202, + "flos": 14319161058240.0, + "grad_norm": 2.357941646265236, + "language_loss": 0.88063473, + "learning_rate": 2.619811749022646e-06, + "loss": 0.9089179, + "num_input_tokens_seen": 74931570, + "router_z_loss_clip": 3.50390625, + "router_z_loss_mlp": 2.64648438, + "step": 3474, + "time_per_iteration": 3.8801209926605225 + }, + { + "auxiliary_loss_clip": 0.01552736, + "auxiliary_loss_mlp": 0.01294164, + "balance_loss_clip": 1.2020061, + "balance_loss_mlp": 1.02351117, + "epoch": 0.4178440449708411, + "flos": 14645346513120.0, + "grad_norm": 2.4767764257411047, + "language_loss": 0.71665037, + "learning_rate": 2.6190710813039917e-06, + "loss": 0.74511945, + "num_input_tokens_seen": 74944695, + "router_z_loss_clip": 3.50976562, + "router_z_loss_mlp": 2.70898438, + "step": 3475, + "time_per_iteration": 3.979475498199463 + }, + { + "auxiliary_loss_clip": 0.01538037, + "auxiliary_loss_mlp": 0.01284426, + "balance_loss_clip": 1.1863749, + "balance_loss_mlp": 1.01548958, + "epoch": 0.4179642878614802, + "flos": 21509463761280.0, + "grad_norm": 4.254422273615974, + "language_loss": 0.83745646, + "learning_rate": 2.618330319672747e-06, + "loss": 0.86568105, + "num_input_tokens_seen": 74964115, + "router_z_loss_clip": 3.51367188, + "router_z_loss_mlp": 2.69140625, + "step": 3476, + "time_per_iteration": 3.0198657512664795 + }, + { + "auxiliary_loss_clip": 0.01541736, + "auxiliary_loss_mlp": 0.01292855, + "balance_loss_clip": 1.19006515, + "balance_loss_mlp": 1.02468157, + "epoch": 0.41808453075211927, + "flos": 18443965482720.0, + "grad_norm": 3.391405941522008, + "language_loss": 0.92026162, + "learning_rate": 2.617589464241284e-06, + "loss": 0.9486075, + "num_input_tokens_seen": 74978515, + "router_z_loss_clip": 3.515625, + "router_z_loss_mlp": 2.68359375, + "step": 3477, + "time_per_iteration": 3.7530081272125244 + }, + { + "auxiliary_loss_clip": 0.01546721, + "auxiliary_loss_mlp": 0.01298637, + "balance_loss_clip": 1.19534469, + "balance_loss_mlp": 1.03065491, + "epoch": 0.4182047736427584, + "flos": 20302994897280.0, + "grad_norm": 2.1895864740881095, + "language_loss": 0.74254644, + "learning_rate": 2.6168485151219914e-06, + "loss": 0.77100003, + "num_input_tokens_seen": 74998135, + "router_z_loss_clip": 3.51171875, + "router_z_loss_mlp": 2.68164062, + "step": 3478, + "time_per_iteration": 2.9961822032928467 + }, + { + "auxiliary_loss_clip": 0.01552151, + "auxiliary_loss_mlp": 0.01302013, + "balance_loss_clip": 1.20064616, + "balance_loss_mlp": 1.03403091, + "epoch": 0.4183250165333975, + "flos": 18878436927360.0, + "grad_norm": 2.6389709662683307, + "language_loss": 0.71387136, + "learning_rate": 2.616107472427269e-06, + "loss": 0.74241292, + "num_input_tokens_seen": 75012830, + "router_z_loss_clip": 3.51171875, + "router_z_loss_mlp": 2.68164062, + "step": 3479, + "time_per_iteration": 2.9638452529907227 + }, + { + "auxiliary_loss_clip": 0.01538777, + "auxiliary_loss_mlp": 0.0128213, + "balance_loss_clip": 1.1872772, + "balance_loss_mlp": 1.01548314, + "epoch": 0.41844525942403654, + "flos": 17741491109280.0, + "grad_norm": 2.4343900602204607, + "language_loss": 0.76103067, + "learning_rate": 2.615366336269533e-06, + "loss": 0.78923976, + "num_input_tokens_seen": 75026495, + "router_z_loss_clip": 3.515625, + "router_z_loss_mlp": 2.66796875, + "step": 3480, + "time_per_iteration": 3.775921106338501 + }, + { + "auxiliary_loss_clip": 0.01544561, + "auxiliary_loss_mlp": 0.01297819, + "balance_loss_clip": 1.19265008, + "balance_loss_mlp": 1.02812004, + "epoch": 0.41856550231467565, + "flos": 18362912276160.0, + "grad_norm": 4.114221474181943, + "language_loss": 0.80529881, + "learning_rate": 2.6146251067612126e-06, + "loss": 0.83372265, + "num_input_tokens_seen": 75041970, + "router_z_loss_clip": 3.51757812, + "router_z_loss_mlp": 2.69921875, + "step": 3481, + "time_per_iteration": 2.9554800987243652 + }, + { + "auxiliary_loss_clip": 0.01544742, + "auxiliary_loss_mlp": 0.01294758, + "balance_loss_clip": 1.19311666, + "balance_loss_mlp": 1.02448654, + "epoch": 0.41868574520531476, + "flos": 22783900616640.0, + "grad_norm": 1.9848228117202125, + "language_loss": 0.82641566, + "learning_rate": 2.6138837840147525e-06, + "loss": 0.8548106, + "num_input_tokens_seen": 75061005, + "router_z_loss_clip": 3.51757812, + "router_z_loss_mlp": 2.70507812, + "step": 3482, + "time_per_iteration": 3.000436782836914 + }, + { + "auxiliary_loss_clip": 0.01540266, + "auxiliary_loss_mlp": 0.01291153, + "balance_loss_clip": 1.188169, + "balance_loss_mlp": 1.02164459, + "epoch": 0.4188059880959538, + "flos": 13700887928640.0, + "grad_norm": 2.0221759177729264, + "language_loss": 0.76745296, + "learning_rate": 2.6131423681426103e-06, + "loss": 0.79576713, + "num_input_tokens_seen": 75076920, + "router_z_loss_clip": 3.51757812, + "router_z_loss_mlp": 2.69726562, + "step": 3483, + "time_per_iteration": 3.077199935913086 + }, + { + "auxiliary_loss_clip": 0.01547358, + "auxiliary_loss_mlp": 0.01286417, + "balance_loss_clip": 1.19565177, + "balance_loss_mlp": 1.02129555, + "epoch": 0.41892623098659293, + "flos": 37821466239840.0, + "grad_norm": 1.9219687006193444, + "language_loss": 0.72752589, + "learning_rate": 2.6124008592572587e-06, + "loss": 0.75586367, + "num_input_tokens_seen": 75100905, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 2.65234375, + "step": 3484, + "time_per_iteration": 3.3266100883483887 + }, + { + "auxiliary_loss_clip": 0.01533634, + "auxiliary_loss_mlp": 0.01294985, + "balance_loss_clip": 1.18140721, + "balance_loss_mlp": 1.02471352, + "epoch": 0.419046473877232, + "flos": 23261155390080.0, + "grad_norm": 2.6180987795230637, + "language_loss": 0.81529152, + "learning_rate": 2.6116592574711835e-06, + "loss": 0.84357774, + "num_input_tokens_seen": 75119205, + "router_z_loss_clip": 3.5234375, + "router_z_loss_mlp": 2.70507812, + "step": 3485, + "time_per_iteration": 3.1141040325164795 + }, + { + "auxiliary_loss_clip": 0.01541795, + "auxiliary_loss_mlp": 0.01314379, + "balance_loss_clip": 1.19010627, + "balance_loss_mlp": 1.04563367, + "epoch": 0.4191667167678711, + "flos": 20743155565920.0, + "grad_norm": 4.2830218830068025, + "language_loss": 0.84284753, + "learning_rate": 2.6109175628968853e-06, + "loss": 0.8714093, + "num_input_tokens_seen": 75138970, + "router_z_loss_clip": 3.51367188, + "router_z_loss_mlp": 2.68945312, + "step": 3486, + "time_per_iteration": 3.152282953262329 + }, + { + "auxiliary_loss_clip": 0.01533788, + "auxiliary_loss_mlp": 0.01295678, + "balance_loss_clip": 1.1800158, + "balance_loss_mlp": 1.02922106, + "epoch": 0.4192869596585102, + "flos": 23588668330560.0, + "grad_norm": 1.975897973149427, + "language_loss": 0.82671058, + "learning_rate": 2.610175775646878e-06, + "loss": 0.8550052, + "num_input_tokens_seen": 75157550, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 2.66601562, + "step": 3487, + "time_per_iteration": 3.083527088165283 + }, + { + "auxiliary_loss_clip": 0.01532747, + "auxiliary_loss_mlp": 0.01307801, + "balance_loss_clip": 1.17910838, + "balance_loss_mlp": 1.0394367, + "epoch": 0.41940720254914926, + "flos": 25083545837760.0, + "grad_norm": 2.1697014507827874, + "language_loss": 0.72954762, + "learning_rate": 2.6094338958336907e-06, + "loss": 0.75795311, + "num_input_tokens_seen": 75176220, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 2.68554688, + "step": 3488, + "time_per_iteration": 3.034428358078003 + }, + { + "auxiliary_loss_clip": 0.01531223, + "auxiliary_loss_mlp": 0.01297116, + "balance_loss_clip": 1.17690992, + "balance_loss_mlp": 1.03256726, + "epoch": 0.41952744543978837, + "flos": 15555631461120.0, + "grad_norm": 2.13419015722523, + "language_loss": 0.82190526, + "learning_rate": 2.608691923569867e-06, + "loss": 0.85018861, + "num_input_tokens_seen": 75193095, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.64648438, + "step": 3489, + "time_per_iteration": 2.988199234008789 + }, + { + "auxiliary_loss_clip": 0.01531519, + "auxiliary_loss_mlp": 0.01301324, + "balance_loss_clip": 1.17725194, + "balance_loss_mlp": 1.03276944, + "epoch": 0.4196476883304275, + "flos": 24647102128800.0, + "grad_norm": 1.6155772937643882, + "language_loss": 0.75683129, + "learning_rate": 2.6079498589679616e-06, + "loss": 0.78515971, + "num_input_tokens_seen": 75214185, + "router_z_loss_clip": 3.54101562, + "router_z_loss_mlp": 2.6875, + "step": 3490, + "time_per_iteration": 2.999624252319336 + }, + { + "auxiliary_loss_clip": 0.0153187, + "auxiliary_loss_mlp": 0.01305528, + "balance_loss_clip": 1.17748976, + "balance_loss_mlp": 1.03277659, + "epoch": 0.41976793122106654, + "flos": 24533430340320.0, + "grad_norm": 2.7685524787549425, + "language_loss": 0.7617017, + "learning_rate": 2.6072077021405465e-06, + "loss": 0.79007578, + "num_input_tokens_seen": 75233020, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 2.73046875, + "step": 3491, + "time_per_iteration": 3.0278537273406982 + }, + { + "auxiliary_loss_clip": 0.01528399, + "auxiliary_loss_mlp": 0.01293606, + "balance_loss_clip": 1.17324662, + "balance_loss_mlp": 1.02638626, + "epoch": 0.41988817411170565, + "flos": 21177247728960.0, + "grad_norm": 2.0252666617020783, + "language_loss": 0.69253421, + "learning_rate": 2.6064654532002054e-06, + "loss": 0.72075427, + "num_input_tokens_seen": 75252030, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.67382812, + "step": 3492, + "time_per_iteration": 3.1977791786193848 + }, + { + "auxiliary_loss_clip": 0.01524994, + "auxiliary_loss_mlp": 0.01292405, + "balance_loss_clip": 1.17001843, + "balance_loss_mlp": 1.02442288, + "epoch": 0.42000841700234476, + "flos": 31652086020480.0, + "grad_norm": 1.5943714400576516, + "language_loss": 0.75871688, + "learning_rate": 2.6057231122595375e-06, + "loss": 0.78689092, + "num_input_tokens_seen": 75273340, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.68164062, + "step": 3493, + "time_per_iteration": 3.171476125717163 + }, + { + "auxiliary_loss_clip": 0.01528083, + "auxiliary_loss_mlp": 0.01291521, + "balance_loss_clip": 1.17218757, + "balance_loss_mlp": 1.02601814, + "epoch": 0.4201286598929838, + "flos": 21283295957280.0, + "grad_norm": 1.6860671972104067, + "language_loss": 0.73121727, + "learning_rate": 2.604980679431154e-06, + "loss": 0.75941336, + "num_input_tokens_seen": 75291580, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.65625, + "step": 3494, + "time_per_iteration": 3.0232138633728027 + }, + { + "auxiliary_loss_clip": 0.01527229, + "auxiliary_loss_mlp": 0.01301874, + "balance_loss_clip": 1.17191112, + "balance_loss_mlp": 1.03255641, + "epoch": 0.4202489027836229, + "flos": 18548231087520.0, + "grad_norm": 2.101874858324714, + "language_loss": 0.74913442, + "learning_rate": 2.604238154827684e-06, + "loss": 0.77742541, + "num_input_tokens_seen": 75308205, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 2.6953125, + "step": 3495, + "time_per_iteration": 3.2022476196289062 + }, + { + "auxiliary_loss_clip": 0.01539487, + "auxiliary_loss_mlp": 0.012763, + "balance_loss_clip": 1.18520427, + "balance_loss_mlp": 1.01270449, + "epoch": 0.42036914567426203, + "flos": 19319659584480.0, + "grad_norm": 2.019470910212401, + "language_loss": 0.73088437, + "learning_rate": 2.6034955385617656e-06, + "loss": 0.75904226, + "num_input_tokens_seen": 75326535, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.63671875, + "step": 3496, + "time_per_iteration": 2.990245819091797 + }, + { + "auxiliary_loss_clip": 0.01683265, + "auxiliary_loss_mlp": 0.0123143, + "balance_loss_clip": 1.33696079, + "balance_loss_mlp": 1.01475525, + "epoch": 0.4204893885649011, + "flos": 67849679623680.0, + "grad_norm": 0.7266233348422415, + "language_loss": 0.61546397, + "learning_rate": 2.6027528307460544e-06, + "loss": 0.644611, + "num_input_tokens_seen": 75390540, + "router_z_loss_clip": 3.46875, + "router_z_loss_mlp": 2.171875, + "step": 3497, + "time_per_iteration": 3.5889875888824463 + }, + { + "auxiliary_loss_clip": 0.01526432, + "auxiliary_loss_mlp": 0.01293732, + "balance_loss_clip": 1.17137003, + "balance_loss_mlp": 1.02784765, + "epoch": 0.4206096314555402, + "flos": 21910975270560.0, + "grad_norm": 2.096051319084098, + "language_loss": 0.86502773, + "learning_rate": 2.602010031493217e-06, + "loss": 0.89322937, + "num_input_tokens_seen": 75408770, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 2.66015625, + "step": 3498, + "time_per_iteration": 3.0680408477783203 + }, + { + "auxiliary_loss_clip": 0.01528064, + "auxiliary_loss_mlp": 0.01287421, + "balance_loss_clip": 1.17233598, + "balance_loss_mlp": 1.02153707, + "epoch": 0.42072987434617926, + "flos": 29280755848320.0, + "grad_norm": 3.0801051812636104, + "language_loss": 0.87175846, + "learning_rate": 2.6012671409159367e-06, + "loss": 0.89991331, + "num_input_tokens_seen": 75430105, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.66015625, + "step": 3499, + "time_per_iteration": 3.1230099201202393 + }, + { + "auxiliary_loss_clip": 0.01529971, + "auxiliary_loss_mlp": 0.01295923, + "balance_loss_clip": 1.1743232, + "balance_loss_mlp": 1.02603316, + "epoch": 0.42085011723681837, + "flos": 27603252429120.0, + "grad_norm": 2.900018602558805, + "language_loss": 0.8202076, + "learning_rate": 2.6005241591269097e-06, + "loss": 0.84846652, + "num_input_tokens_seen": 75449475, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.70117188, + "step": 3500, + "time_per_iteration": 3.0917885303497314 + }, + { + "auxiliary_loss_clip": 0.0153384, + "auxiliary_loss_mlp": 0.01297969, + "balance_loss_clip": 1.17896903, + "balance_loss_mlp": 1.03094029, + "epoch": 0.4209703601274575, + "flos": 27821113966080.0, + "grad_norm": 7.914175699712419, + "language_loss": 0.80030608, + "learning_rate": 2.5997810862388454e-06, + "loss": 0.82862413, + "num_input_tokens_seen": 75469315, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.671875, + "step": 3501, + "time_per_iteration": 3.0849740505218506 + }, + { + "auxiliary_loss_clip": 0.01529364, + "auxiliary_loss_mlp": 0.01294633, + "balance_loss_clip": 1.17440617, + "balance_loss_mlp": 1.025316, + "epoch": 0.42109060301809653, + "flos": 27527926374720.0, + "grad_norm": 2.065654231259178, + "language_loss": 0.75791335, + "learning_rate": 2.599037922364467e-06, + "loss": 0.78615332, + "num_input_tokens_seen": 75488215, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.6953125, + "step": 3502, + "time_per_iteration": 3.9695816040039062 + }, + { + "auxiliary_loss_clip": 0.01525294, + "auxiliary_loss_mlp": 0.01294811, + "balance_loss_clip": 1.17054844, + "balance_loss_mlp": 1.02663732, + "epoch": 0.42121084590873564, + "flos": 29316825892800.0, + "grad_norm": 3.0638292061552614, + "language_loss": 0.75429505, + "learning_rate": 2.5982946676165112e-06, + "loss": 0.78249609, + "num_input_tokens_seen": 75507985, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 2.68359375, + "step": 3503, + "time_per_iteration": 3.938549518585205 + }, + { + "auxiliary_loss_clip": 0.0167712, + "auxiliary_loss_mlp": 0.01236298, + "balance_loss_clip": 1.3307097, + "balance_loss_mlp": 1.01809692, + "epoch": 0.42133108879937475, + "flos": 67405195144800.0, + "grad_norm": 0.7347137781443723, + "language_loss": 0.57522964, + "learning_rate": 2.5975513221077313e-06, + "loss": 0.6043638, + "num_input_tokens_seen": 75571955, + "router_z_loss_clip": 3.46875, + "router_z_loss_mlp": 2.1875, + "step": 3504, + "time_per_iteration": 3.684183359146118 + }, + { + "auxiliary_loss_clip": 0.01535016, + "auxiliary_loss_mlp": 0.01313228, + "balance_loss_clip": 1.17958832, + "balance_loss_mlp": 1.04371953, + "epoch": 0.4214513316900138, + "flos": 23107924166400.0, + "grad_norm": 2.2732924597851287, + "language_loss": 0.88148594, + "learning_rate": 2.5968078859508897e-06, + "loss": 0.90996838, + "num_input_tokens_seen": 75589155, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.69726562, + "step": 3505, + "time_per_iteration": 3.9883697032928467 + }, + { + "auxiliary_loss_clip": 0.01528088, + "auxiliary_loss_mlp": 0.01295192, + "balance_loss_clip": 1.17146134, + "balance_loss_mlp": 1.02968931, + "epoch": 0.4215715745806529, + "flos": 15337731996000.0, + "grad_norm": 1.8899827096129533, + "language_loss": 0.79997641, + "learning_rate": 2.5960643592587673e-06, + "loss": 0.82820922, + "num_input_tokens_seen": 75606565, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.65625, + "step": 3506, + "time_per_iteration": 3.0702157020568848 + }, + { + "auxiliary_loss_clip": 0.0153315, + "auxiliary_loss_mlp": 0.01296836, + "balance_loss_clip": 1.17626417, + "balance_loss_mlp": 1.02904439, + "epoch": 0.42169181747129203, + "flos": 22129443658080.0, + "grad_norm": 2.0245454674741237, + "language_loss": 0.8130101, + "learning_rate": 2.5953207421441553e-06, + "loss": 0.84131002, + "num_input_tokens_seen": 75625165, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.6796875, + "step": 3507, + "time_per_iteration": 3.0427980422973633 + }, + { + "auxiliary_loss_clip": 0.01541469, + "auxiliary_loss_mlp": 0.01295302, + "balance_loss_clip": 1.18682742, + "balance_loss_mlp": 1.02884483, + "epoch": 0.4218120603619311, + "flos": 22632527872800.0, + "grad_norm": 2.307308687833303, + "language_loss": 0.75730926, + "learning_rate": 2.5945770347198603e-06, + "loss": 0.78567696, + "num_input_tokens_seen": 75643320, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.66601562, + "step": 3508, + "time_per_iteration": 3.889939785003662 + }, + { + "auxiliary_loss_clip": 0.01528264, + "auxiliary_loss_mlp": 0.01286753, + "balance_loss_clip": 1.17206502, + "balance_loss_mlp": 1.02258539, + "epoch": 0.4219323032525702, + "flos": 19684873480320.0, + "grad_norm": 1.8235907944693268, + "language_loss": 0.81805962, + "learning_rate": 2.593833237098701e-06, + "loss": 0.84620976, + "num_input_tokens_seen": 75660920, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.64257812, + "step": 3509, + "time_per_iteration": 3.249937057495117 + }, + { + "auxiliary_loss_clip": 0.01520981, + "auxiliary_loss_mlp": 0.01300375, + "balance_loss_clip": 1.16476226, + "balance_loss_mlp": 1.03296471, + "epoch": 0.4220525461432093, + "flos": 30193392342240.0, + "grad_norm": 2.336244255651473, + "language_loss": 0.62579864, + "learning_rate": 2.593089349393512e-06, + "loss": 0.6540122, + "num_input_tokens_seen": 75681410, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.67578125, + "step": 3510, + "time_per_iteration": 3.1165778636932373 + }, + { + "auxiliary_loss_clip": 0.01530287, + "auxiliary_loss_mlp": 0.01290315, + "balance_loss_clip": 1.17387938, + "balance_loss_mlp": 1.02519345, + "epoch": 0.42217278903384836, + "flos": 24318223774560.0, + "grad_norm": 2.6204996594021366, + "language_loss": 0.8336339, + "learning_rate": 2.592345371717141e-06, + "loss": 0.86183995, + "num_input_tokens_seen": 75700940, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.65234375, + "step": 3511, + "time_per_iteration": 3.015692949295044 + }, + { + "auxiliary_loss_clip": 0.0153007, + "auxiliary_loss_mlp": 0.01298259, + "balance_loss_clip": 1.17419028, + "balance_loss_mlp": 1.02779734, + "epoch": 0.42229303192448747, + "flos": 17094392213760.0, + "grad_norm": 2.273300754092219, + "language_loss": 0.72182828, + "learning_rate": 2.591601304182448e-06, + "loss": 0.75011158, + "num_input_tokens_seen": 75718910, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.70703125, + "step": 3512, + "time_per_iteration": 3.0710792541503906 + }, + { + "auxiliary_loss_clip": 0.01534708, + "auxiliary_loss_mlp": 0.01298784, + "balance_loss_clip": 1.17909348, + "balance_loss_mlp": 1.03308988, + "epoch": 0.4224132748151266, + "flos": 22786783156800.0, + "grad_norm": 1.8996526348297638, + "language_loss": 0.79116893, + "learning_rate": 2.5908571469023067e-06, + "loss": 0.81950384, + "num_input_tokens_seen": 75738395, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.65820312, + "step": 3513, + "time_per_iteration": 3.048131227493286 + }, + { + "auxiliary_loss_clip": 0.01525439, + "auxiliary_loss_mlp": 0.01289247, + "balance_loss_clip": 1.16935265, + "balance_loss_mlp": 1.02279055, + "epoch": 0.42253351770576564, + "flos": 17820875476800.0, + "grad_norm": 2.655919902556904, + "language_loss": 0.75034028, + "learning_rate": 2.5901128999896067e-06, + "loss": 0.77848721, + "num_input_tokens_seen": 75753825, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.66601562, + "step": 3514, + "time_per_iteration": 3.0456089973449707 + }, + { + "auxiliary_loss_clip": 0.01519931, + "auxiliary_loss_mlp": 0.0128621, + "balance_loss_clip": 1.16302717, + "balance_loss_mlp": 1.02051663, + "epoch": 0.42265376059640475, + "flos": 28514940719040.0, + "grad_norm": 1.8635448823219822, + "language_loss": 0.68315428, + "learning_rate": 2.5893685635572487e-06, + "loss": 0.71121567, + "num_input_tokens_seen": 75774675, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.65820312, + "step": 3515, + "time_per_iteration": 3.2147276401519775 + }, + { + "auxiliary_loss_clip": 0.01530722, + "auxiliary_loss_mlp": 0.01295192, + "balance_loss_clip": 1.17391348, + "balance_loss_mlp": 1.03026152, + "epoch": 0.4227740034870438, + "flos": 16255223294400.0, + "grad_norm": 2.2349004240191817, + "language_loss": 0.68838286, + "learning_rate": 2.5886241377181483e-06, + "loss": 0.71664196, + "num_input_tokens_seen": 75793545, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.65039062, + "step": 3516, + "time_per_iteration": 2.972437620162964 + }, + { + "auxiliary_loss_clip": 0.01520786, + "auxiliary_loss_mlp": 0.01291404, + "balance_loss_clip": 1.16347456, + "balance_loss_mlp": 1.021896, + "epoch": 0.4228942463776829, + "flos": 25297728343200.0, + "grad_norm": 1.9764810022331865, + "language_loss": 0.81693602, + "learning_rate": 2.587879622585234e-06, + "loss": 0.84505796, + "num_input_tokens_seen": 75812145, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.69726562, + "step": 3517, + "time_per_iteration": 3.0572261810302734 + }, + { + "auxiliary_loss_clip": 0.01528089, + "auxiliary_loss_mlp": 0.01296079, + "balance_loss_clip": 1.16988719, + "balance_loss_mlp": 1.03076708, + "epoch": 0.423014489268322, + "flos": 26398186907040.0, + "grad_norm": 2.256596113223992, + "language_loss": 0.75817549, + "learning_rate": 2.5871350182714486e-06, + "loss": 0.78641713, + "num_input_tokens_seen": 75833025, + "router_z_loss_clip": 3.58398438, + "router_z_loss_mlp": 2.65429688, + "step": 3518, + "time_per_iteration": 3.209568977355957 + }, + { + "auxiliary_loss_clip": 0.01516723, + "auxiliary_loss_mlp": 0.01289469, + "balance_loss_clip": 1.15988445, + "balance_loss_mlp": 1.02396584, + "epoch": 0.4231347321589611, + "flos": 17275880280960.0, + "grad_norm": 3.718502188635271, + "language_loss": 0.80410981, + "learning_rate": 2.586390324889748e-06, + "loss": 0.83217174, + "num_input_tokens_seen": 75848925, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.65625, + "step": 3519, + "time_per_iteration": 3.0182783603668213 + }, + { + "auxiliary_loss_clip": 0.01517212, + "auxiliary_loss_mlp": 0.01292551, + "balance_loss_clip": 1.16009498, + "balance_loss_mlp": 1.02685785, + "epoch": 0.4232549750496002, + "flos": 23001686297280.0, + "grad_norm": 2.0255997222915223, + "language_loss": 0.67867768, + "learning_rate": 2.5856455425531003e-06, + "loss": 0.70677531, + "num_input_tokens_seen": 75870400, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 2.65820312, + "step": 3520, + "time_per_iteration": 3.16491436958313 + }, + { + "auxiliary_loss_clip": 0.01517006, + "auxiliary_loss_mlp": 0.01290979, + "balance_loss_clip": 1.16020977, + "balance_loss_mlp": 1.02490354, + "epoch": 0.4233752179402393, + "flos": 21250298093760.0, + "grad_norm": 1.926537291408738, + "language_loss": 0.80536664, + "learning_rate": 2.5849006713744902e-06, + "loss": 0.8334465, + "num_input_tokens_seen": 75889195, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.66210938, + "step": 3521, + "time_per_iteration": 3.106325387954712 + }, + { + "auxiliary_loss_clip": 0.01522018, + "auxiliary_loss_mlp": 0.01302493, + "balance_loss_clip": 1.16407549, + "balance_loss_mlp": 1.03432012, + "epoch": 0.42349546083087836, + "flos": 20706402814560.0, + "grad_norm": 2.370270432066663, + "language_loss": 0.73484457, + "learning_rate": 2.5841557114669135e-06, + "loss": 0.76308972, + "num_input_tokens_seen": 75906055, + "router_z_loss_clip": 3.58398438, + "router_z_loss_mlp": 2.68359375, + "step": 3522, + "time_per_iteration": 3.1111693382263184 + }, + { + "auxiliary_loss_clip": 0.01515208, + "auxiliary_loss_mlp": 0.0129969, + "balance_loss_clip": 1.15817356, + "balance_loss_mlp": 1.02922785, + "epoch": 0.42361570372151747, + "flos": 18586880246880.0, + "grad_norm": 2.9092800641127248, + "language_loss": 0.67516947, + "learning_rate": 2.58341066294338e-06, + "loss": 0.70331842, + "num_input_tokens_seen": 75922720, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.70703125, + "step": 3523, + "time_per_iteration": 3.106008291244507 + }, + { + "auxiliary_loss_clip": 0.01509537, + "auxiliary_loss_mlp": 0.01306086, + "balance_loss_clip": 1.1533432, + "balance_loss_mlp": 1.03810382, + "epoch": 0.4237359466121566, + "flos": 20961586025280.0, + "grad_norm": 2.3989114308351596, + "language_loss": 0.84939784, + "learning_rate": 2.5826655259169124e-06, + "loss": 0.877554, + "num_input_tokens_seen": 75941375, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.68164062, + "step": 3524, + "time_per_iteration": 3.0950231552124023 + }, + { + "auxiliary_loss_clip": 0.0151888, + "auxiliary_loss_mlp": 0.01296943, + "balance_loss_clip": 1.16254532, + "balance_loss_mlp": 1.03010523, + "epoch": 0.42385618950279563, + "flos": 18039988643040.0, + "grad_norm": 2.2344727383879954, + "language_loss": 0.90598428, + "learning_rate": 2.5819203005005475e-06, + "loss": 0.93414253, + "num_input_tokens_seen": 75958710, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.66992188, + "step": 3525, + "time_per_iteration": 3.176987409591675 + }, + { + "auxiliary_loss_clip": 0.01515742, + "auxiliary_loss_mlp": 0.01286493, + "balance_loss_clip": 1.1580683, + "balance_loss_mlp": 1.02118075, + "epoch": 0.42397643239343474, + "flos": 23771408027040.0, + "grad_norm": 1.6576810840684322, + "language_loss": 0.78534436, + "learning_rate": 2.581174986807336e-06, + "loss": 0.81336671, + "num_input_tokens_seen": 75978945, + "router_z_loss_clip": 3.57617188, + "router_z_loss_mlp": 2.65429688, + "step": 3526, + "time_per_iteration": 3.0816562175750732 + }, + { + "auxiliary_loss_clip": 0.01517784, + "auxiliary_loss_mlp": 0.01300753, + "balance_loss_clip": 1.15896809, + "balance_loss_mlp": 1.03391516, + "epoch": 0.42409667528407385, + "flos": 16546969615680.0, + "grad_norm": 3.103015302881873, + "language_loss": 0.91345221, + "learning_rate": 2.580429584950341e-06, + "loss": 0.94163764, + "num_input_tokens_seen": 75994695, + "router_z_loss_clip": 3.5859375, + "router_z_loss_mlp": 2.66992188, + "step": 3527, + "time_per_iteration": 3.1260876655578613 + }, + { + "auxiliary_loss_clip": 0.01516541, + "auxiliary_loss_mlp": 0.01288676, + "balance_loss_clip": 1.15901959, + "balance_loss_mlp": 1.02088475, + "epoch": 0.4242169181747129, + "flos": 16036034271840.0, + "grad_norm": 2.5576684639798293, + "language_loss": 0.66571999, + "learning_rate": 2.5796840950426397e-06, + "loss": 0.6937722, + "num_input_tokens_seen": 76011780, + "router_z_loss_clip": 3.57617188, + "router_z_loss_mlp": 2.6796875, + "step": 3528, + "time_per_iteration": 3.0889768600463867 + }, + { + "auxiliary_loss_clip": 0.01520084, + "auxiliary_loss_mlp": 0.01290246, + "balance_loss_clip": 1.1628449, + "balance_loss_mlp": 1.02722239, + "epoch": 0.424337161065352, + "flos": 20086081564320.0, + "grad_norm": 1.8205602427348238, + "language_loss": 0.65889943, + "learning_rate": 2.578938517197322e-06, + "loss": 0.68700278, + "num_input_tokens_seen": 76029875, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 2.63085938, + "step": 3529, + "time_per_iteration": 3.966845750808716 + }, + { + "auxiliary_loss_clip": 0.01524994, + "auxiliary_loss_mlp": 0.01289645, + "balance_loss_clip": 1.16727102, + "balance_loss_mlp": 1.02337909, + "epoch": 0.4244574039559911, + "flos": 23880642220800.0, + "grad_norm": 3.076755410758186, + "language_loss": 0.62667894, + "learning_rate": 2.5781928515274916e-06, + "loss": 0.65482533, + "num_input_tokens_seen": 76048595, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 2.6640625, + "step": 3530, + "time_per_iteration": 3.89683198928833 + }, + { + "auxiliary_loss_clip": 0.0151925, + "auxiliary_loss_mlp": 0.01294614, + "balance_loss_clip": 1.15998006, + "balance_loss_mlp": 1.02834857, + "epoch": 0.4245776468466302, + "flos": 17567816243040.0, + "grad_norm": 3.265086905002624, + "language_loss": 0.68111241, + "learning_rate": 2.577447098146265e-06, + "loss": 0.70925105, + "num_input_tokens_seen": 76065770, + "router_z_loss_clip": 3.59375, + "router_z_loss_mlp": 2.6640625, + "step": 3531, + "time_per_iteration": 3.0347986221313477 + }, + { + "auxiliary_loss_clip": 0.01512678, + "auxiliary_loss_mlp": 0.01292637, + "balance_loss_clip": 1.15470243, + "balance_loss_mlp": 1.02827871, + "epoch": 0.4246978897372693, + "flos": 27778406493600.0, + "grad_norm": 2.237886198447226, + "language_loss": 0.79085845, + "learning_rate": 2.5767012571667724e-06, + "loss": 0.81891155, + "num_input_tokens_seen": 76085250, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 2.64453125, + "step": 3532, + "time_per_iteration": 3.996588706970215 + }, + { + "auxiliary_loss_clip": 0.01512916, + "auxiliary_loss_mlp": 0.01298646, + "balance_loss_clip": 1.15275574, + "balance_loss_mlp": 1.0297097, + "epoch": 0.42481813262790835, + "flos": 15598452718080.0, + "grad_norm": 2.8418707916180366, + "language_loss": 0.68120837, + "learning_rate": 2.5759553287021587e-06, + "loss": 0.709324, + "num_input_tokens_seen": 76103580, + "router_z_loss_clip": 3.6015625, + "router_z_loss_mlp": 2.69140625, + "step": 3533, + "time_per_iteration": 3.0366055965423584 + }, + { + "auxiliary_loss_clip": 0.01519061, + "auxiliary_loss_mlp": 0.01294951, + "balance_loss_clip": 1.15983987, + "balance_loss_mlp": 1.02620602, + "epoch": 0.42493837551854746, + "flos": 23953351232160.0, + "grad_norm": 1.876330143584023, + "language_loss": 0.77338797, + "learning_rate": 2.5752093128655786e-06, + "loss": 0.8015281, + "num_input_tokens_seen": 76121825, + "router_z_loss_clip": 3.59570312, + "router_z_loss_mlp": 2.68945312, + "step": 3534, + "time_per_iteration": 3.015503406524658 + }, + { + "auxiliary_loss_clip": 0.01511768, + "auxiliary_loss_mlp": 0.01290255, + "balance_loss_clip": 1.15287232, + "balance_loss_mlp": 1.02417946, + "epoch": 0.4250586184091866, + "flos": 20815561152000.0, + "grad_norm": 1.9318367633663585, + "language_loss": 0.73715812, + "learning_rate": 2.574463209770204e-06, + "loss": 0.76517832, + "num_input_tokens_seen": 76141140, + "router_z_loss_clip": 3.58984375, + "router_z_loss_mlp": 2.66210938, + "step": 3535, + "time_per_iteration": 3.020291566848755 + }, + { + "auxiliary_loss_clip": 0.01506533, + "auxiliary_loss_mlp": 0.01294891, + "balance_loss_clip": 1.14673626, + "balance_loss_mlp": 1.02709889, + "epoch": 0.42517886129982563, + "flos": 30373552923840.0, + "grad_norm": 2.612805888690598, + "language_loss": 0.79975641, + "learning_rate": 2.5737170195292165e-06, + "loss": 0.82777059, + "num_input_tokens_seen": 76164475, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 2.6796875, + "step": 3536, + "time_per_iteration": 3.9193546772003174 + }, + { + "auxiliary_loss_clip": 0.01510267, + "auxiliary_loss_mlp": 0.01298423, + "balance_loss_clip": 1.15244341, + "balance_loss_mlp": 1.02986813, + "epoch": 0.42529910419046474, + "flos": 20082516317280.0, + "grad_norm": 13.826161051193148, + "language_loss": 0.78251666, + "learning_rate": 2.572970742255814e-06, + "loss": 0.81060356, + "num_input_tokens_seen": 76182965, + "router_z_loss_clip": 3.58007812, + "router_z_loss_mlp": 2.6875, + "step": 3537, + "time_per_iteration": 3.0071401596069336 + }, + { + "auxiliary_loss_clip": 0.01507775, + "auxiliary_loss_mlp": 0.01276672, + "balance_loss_clip": 1.1503433, + "balance_loss_mlp": 1.01307642, + "epoch": 0.42541934708110385, + "flos": 22634348424480.0, + "grad_norm": 2.371352848217673, + "language_loss": 0.81838107, + "learning_rate": 2.5722243780632046e-06, + "loss": 0.84622556, + "num_input_tokens_seen": 76201230, + "router_z_loss_clip": 3.58007812, + "router_z_loss_mlp": 2.63671875, + "step": 3538, + "time_per_iteration": 2.988408088684082 + }, + { + "auxiliary_loss_clip": 0.01607286, + "auxiliary_loss_mlp": 0.01193443, + "balance_loss_clip": 1.25816381, + "balance_loss_mlp": 0.97829437, + "epoch": 0.4255395899717429, + "flos": 66207298044960.0, + "grad_norm": 0.834987154398028, + "language_loss": 0.60485685, + "learning_rate": 2.5714779270646125e-06, + "loss": 0.63286418, + "num_input_tokens_seen": 76262000, + "router_z_loss_clip": 3.4921875, + "router_z_loss_mlp": 2.15625, + "step": 3539, + "time_per_iteration": 3.457482099533081 + }, + { + "auxiliary_loss_clip": 0.01513001, + "auxiliary_loss_mlp": 0.0130388, + "balance_loss_clip": 1.15462255, + "balance_loss_mlp": 1.03456271, + "epoch": 0.425659832862382, + "flos": 17933788702080.0, + "grad_norm": 2.3833383222311144, + "language_loss": 0.77915996, + "learning_rate": 2.5707313893732735e-06, + "loss": 0.80732876, + "num_input_tokens_seen": 76280540, + "router_z_loss_clip": 3.5859375, + "router_z_loss_mlp": 2.6953125, + "step": 3540, + "time_per_iteration": 3.0337979793548584 + }, + { + "auxiliary_loss_clip": 0.01502583, + "auxiliary_loss_mlp": 0.01304935, + "balance_loss_clip": 1.14366794, + "balance_loss_mlp": 1.03790665, + "epoch": 0.4257800757530211, + "flos": 24024732757920.0, + "grad_norm": 2.555326723390449, + "language_loss": 0.76862133, + "learning_rate": 2.5699847651024364e-06, + "loss": 0.79669654, + "num_input_tokens_seen": 76301180, + "router_z_loss_clip": 3.58984375, + "router_z_loss_mlp": 2.671875, + "step": 3541, + "time_per_iteration": 3.0934195518493652 + }, + { + "auxiliary_loss_clip": 0.01513867, + "auxiliary_loss_mlp": 0.01292519, + "balance_loss_clip": 1.15443945, + "balance_loss_mlp": 1.02758789, + "epoch": 0.4259003186436602, + "flos": 23698281805920.0, + "grad_norm": 2.4768977000688444, + "language_loss": 0.76953447, + "learning_rate": 2.5692380543653627e-06, + "loss": 0.79759836, + "num_input_tokens_seen": 76319335, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 2.65039062, + "step": 3542, + "time_per_iteration": 3.1368000507354736 + }, + { + "auxiliary_loss_clip": 0.01514026, + "auxiliary_loss_mlp": 0.01304004, + "balance_loss_clip": 1.15496016, + "balance_loss_mlp": 1.03621185, + "epoch": 0.4260205615342993, + "flos": 15261116384160.0, + "grad_norm": 3.4049261083623987, + "language_loss": 0.69912195, + "learning_rate": 2.5684912572753293e-06, + "loss": 0.72730219, + "num_input_tokens_seen": 76335010, + "router_z_loss_clip": 3.58789062, + "router_z_loss_mlp": 2.6796875, + "step": 3543, + "time_per_iteration": 3.0744786262512207 + }, + { + "auxiliary_loss_clip": 0.01508382, + "auxiliary_loss_mlp": 0.01295342, + "balance_loss_clip": 1.14863873, + "balance_loss_mlp": 1.02926707, + "epoch": 0.4261408044249384, + "flos": 30667992144480.0, + "grad_norm": 2.0757971653090572, + "language_loss": 0.84678906, + "learning_rate": 2.5677443739456245e-06, + "loss": 0.87482625, + "num_input_tokens_seen": 76356670, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 2.66210938, + "step": 3544, + "time_per_iteration": 3.149876356124878 + }, + { + "auxiliary_loss_clip": 0.01505406, + "auxiliary_loss_mlp": 0.01292233, + "balance_loss_clip": 1.14574718, + "balance_loss_mlp": 1.0269208, + "epoch": 0.42626104731557746, + "flos": 23260131329760.0, + "grad_norm": 2.7480669965461253, + "language_loss": 0.79534543, + "learning_rate": 2.5669974044895495e-06, + "loss": 0.82332182, + "num_input_tokens_seen": 76373065, + "router_z_loss_clip": 3.59375, + "router_z_loss_mlp": 2.65429688, + "step": 3545, + "time_per_iteration": 3.1066670417785645 + }, + { + "auxiliary_loss_clip": 0.01511407, + "auxiliary_loss_mlp": 0.01303101, + "balance_loss_clip": 1.15258503, + "balance_loss_mlp": 1.03607225, + "epoch": 0.42638129020621657, + "flos": 25887137778720.0, + "grad_norm": 7.901796013936226, + "language_loss": 0.79916948, + "learning_rate": 2.5662503490204187e-06, + "loss": 0.82731456, + "num_input_tokens_seen": 76393230, + "router_z_loss_clip": 3.58789062, + "router_z_loss_mlp": 2.671875, + "step": 3546, + "time_per_iteration": 3.083317279815674 + }, + { + "auxiliary_loss_clip": 0.01502676, + "auxiliary_loss_mlp": 0.01292867, + "balance_loss_clip": 1.14332151, + "balance_loss_mlp": 1.02564692, + "epoch": 0.4265015330968556, + "flos": 26504614416960.0, + "grad_norm": 1.9997438786433606, + "language_loss": 0.76624805, + "learning_rate": 2.5655032076515603e-06, + "loss": 0.79420352, + "num_input_tokens_seen": 76412555, + "router_z_loss_clip": 3.58984375, + "router_z_loss_mlp": 2.67382812, + "step": 3547, + "time_per_iteration": 3.059353828430176 + }, + { + "auxiliary_loss_clip": 0.01516717, + "auxiliary_loss_mlp": 0.01293888, + "balance_loss_clip": 1.15681052, + "balance_loss_mlp": 1.02666855, + "epoch": 0.42662177598749473, + "flos": 24391843061760.0, + "grad_norm": 2.135160841127381, + "language_loss": 0.82675219, + "learning_rate": 2.5647559804963155e-06, + "loss": 0.85485816, + "num_input_tokens_seen": 76432485, + "router_z_loss_clip": 3.6015625, + "router_z_loss_mlp": 2.67382812, + "step": 3548, + "time_per_iteration": 3.0961875915527344 + }, + { + "auxiliary_loss_clip": 0.01505298, + "auxiliary_loss_mlp": 0.01288579, + "balance_loss_clip": 1.14616871, + "balance_loss_mlp": 1.02402949, + "epoch": 0.42674201887813384, + "flos": 23150859207840.0, + "grad_norm": 2.2530029107647533, + "language_loss": 0.78794962, + "learning_rate": 2.5640086676680364e-06, + "loss": 0.8158884, + "num_input_tokens_seen": 76453980, + "router_z_loss_clip": 3.59179688, + "router_z_loss_mlp": 2.64648438, + "step": 3549, + "time_per_iteration": 3.096249580383301 + }, + { + "auxiliary_loss_clip": 0.01504971, + "auxiliary_loss_mlp": 0.01291141, + "balance_loss_clip": 1.14526629, + "balance_loss_mlp": 1.02411199, + "epoch": 0.4268622617687729, + "flos": 21691786248000.0, + "grad_norm": 2.5207294707835057, + "language_loss": 0.8067804, + "learning_rate": 2.5632612692800923e-06, + "loss": 0.83474147, + "num_input_tokens_seen": 76473045, + "router_z_loss_clip": 3.6015625, + "router_z_loss_mlp": 2.671875, + "step": 3550, + "time_per_iteration": 2.9817023277282715 + }, + { + "auxiliary_loss_clip": 0.01513685, + "auxiliary_loss_mlp": 0.0129638, + "balance_loss_clip": 1.15269947, + "balance_loss_mlp": 1.02763438, + "epoch": 0.426982504659412, + "flos": 23442377960160.0, + "grad_norm": 2.664743714785403, + "language_loss": 0.7589578, + "learning_rate": 2.5625137854458603e-06, + "loss": 0.78705841, + "num_input_tokens_seen": 76492060, + "router_z_loss_clip": 3.60742188, + "router_z_loss_mlp": 2.68945312, + "step": 3551, + "time_per_iteration": 3.037083625793457 + }, + { + "auxiliary_loss_clip": 0.01507418, + "auxiliary_loss_mlp": 0.01292544, + "balance_loss_clip": 1.1472826, + "balance_loss_mlp": 1.02837646, + "epoch": 0.4271027475500511, + "flos": 18918565284960.0, + "grad_norm": 1.901794931258082, + "language_loss": 0.80089718, + "learning_rate": 2.561766216278735e-06, + "loss": 0.82889676, + "num_input_tokens_seen": 76509655, + "router_z_loss_clip": 3.6015625, + "router_z_loss_mlp": 2.64257812, + "step": 3552, + "time_per_iteration": 3.0725643634796143 + }, + { + "auxiliary_loss_clip": 0.01508093, + "auxiliary_loss_mlp": 0.01298446, + "balance_loss_clip": 1.14727867, + "balance_loss_mlp": 1.03408742, + "epoch": 0.4272229904406902, + "flos": 26873014278240.0, + "grad_norm": 1.9087852886274077, + "language_loss": 0.81011415, + "learning_rate": 2.561018561892121e-06, + "loss": 0.83817959, + "num_input_tokens_seen": 76528795, + "router_z_loss_clip": 3.60742188, + "router_z_loss_mlp": 2.64453125, + "step": 3553, + "time_per_iteration": 3.023564100265503 + }, + { + "auxiliary_loss_clip": 0.01499038, + "auxiliary_loss_mlp": 0.01313966, + "balance_loss_clip": 1.1387217, + "balance_loss_mlp": 1.04560161, + "epoch": 0.4273432333313293, + "flos": 23953654657440.0, + "grad_norm": 2.358804869672386, + "language_loss": 0.76570201, + "learning_rate": 2.5602708223994363e-06, + "loss": 0.79383206, + "num_input_tokens_seen": 76550660, + "router_z_loss_clip": 3.60351562, + "router_z_loss_mlp": 2.68554688, + "step": 3554, + "time_per_iteration": 3.0134193897247314 + }, + { + "auxiliary_loss_clip": 0.01498351, + "auxiliary_loss_mlp": 0.01291584, + "balance_loss_clip": 1.13706779, + "balance_loss_mlp": 1.02569997, + "epoch": 0.4274634762219684, + "flos": 29572957307520.0, + "grad_norm": 3.0977966610826737, + "language_loss": 0.67958724, + "learning_rate": 2.559522997914115e-06, + "loss": 0.70748657, + "num_input_tokens_seen": 76570240, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 2.66015625, + "step": 3555, + "time_per_iteration": 3.024934768676758 + }, + { + "auxiliary_loss_clip": 0.01506061, + "auxiliary_loss_mlp": 0.01290855, + "balance_loss_clip": 1.14562368, + "balance_loss_mlp": 1.02783167, + "epoch": 0.42758371911260745, + "flos": 21436299612000.0, + "grad_norm": 4.161327066303803, + "language_loss": 0.84353113, + "learning_rate": 2.558775088549599e-06, + "loss": 0.87150025, + "num_input_tokens_seen": 76589820, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 2.63085938, + "step": 3556, + "time_per_iteration": 3.168287515640259 + }, + { + "auxiliary_loss_clip": 0.01503153, + "auxiliary_loss_mlp": 0.01310844, + "balance_loss_clip": 1.14205837, + "balance_loss_mlp": 1.04228938, + "epoch": 0.42770396200324656, + "flos": 14754353137920.0, + "grad_norm": 3.624693511685147, + "language_loss": 0.66717589, + "learning_rate": 2.5580270944193467e-06, + "loss": 0.6953159, + "num_input_tokens_seen": 76606640, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 2.6875, + "step": 3557, + "time_per_iteration": 4.8041369915008545 + }, + { + "auxiliary_loss_clip": 0.01542317, + "auxiliary_loss_mlp": 0.01288284, + "balance_loss_clip": 1.18613029, + "balance_loss_mlp": 1.06932068, + "epoch": 0.4278242048938857, + "flos": 70661739386880.0, + "grad_norm": 0.7753313928357799, + "language_loss": 0.5546068, + "learning_rate": 2.557279015636827e-06, + "loss": 0.5829128, + "num_input_tokens_seen": 76667050, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.1953125, + "step": 3558, + "time_per_iteration": 3.4404330253601074 + }, + { + "auxiliary_loss_clip": 0.01541331, + "auxiliary_loss_mlp": 0.01248688, + "balance_loss_clip": 1.18418384, + "balance_loss_mlp": 1.02972412, + "epoch": 0.42794444778452473, + "flos": 69372472620960.0, + "grad_norm": 0.7802556872972456, + "language_loss": 0.61202943, + "learning_rate": 2.5565308523155245e-06, + "loss": 0.63992965, + "num_input_tokens_seen": 76726650, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.1953125, + "step": 3559, + "time_per_iteration": 3.3519749641418457 + }, + { + "auxiliary_loss_clip": 0.0150859, + "auxiliary_loss_mlp": 0.01290433, + "balance_loss_clip": 1.14745688, + "balance_loss_mlp": 1.02721858, + "epoch": 0.42806469067516384, + "flos": 18216849474720.0, + "grad_norm": 2.749445509575963, + "language_loss": 0.82093, + "learning_rate": 2.5557826045689336e-06, + "loss": 0.84892023, + "num_input_tokens_seen": 76742890, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 2.6328125, + "step": 3560, + "time_per_iteration": 3.95845627784729 + }, + { + "auxiliary_loss_clip": 0.01538555, + "auxiliary_loss_mlp": 0.0121106, + "balance_loss_clip": 1.18096256, + "balance_loss_mlp": 0.99438477, + "epoch": 0.4281849335658029, + "flos": 54542375854560.0, + "grad_norm": 0.8308898120176096, + "language_loss": 0.58795542, + "learning_rate": 2.5550342725105643e-06, + "loss": 0.61545157, + "num_input_tokens_seen": 76801055, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 2.171875, + "step": 3561, + "time_per_iteration": 3.3716561794281006 + }, + { + "auxiliary_loss_clip": 0.01506609, + "auxiliary_loss_mlp": 0.01294615, + "balance_loss_clip": 1.14459443, + "balance_loss_mlp": 1.02739596, + "epoch": 0.428305176456442, + "flos": 17276752628640.0, + "grad_norm": 1.8834544219032046, + "language_loss": 0.81262887, + "learning_rate": 2.554285856253937e-06, + "loss": 0.84064108, + "num_input_tokens_seen": 76819890, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.67382812, + "step": 3562, + "time_per_iteration": 3.057049512863159 + }, + { + "auxiliary_loss_clip": 0.01500164, + "auxiliary_loss_mlp": 0.0128904, + "balance_loss_clip": 1.13760018, + "balance_loss_mlp": 1.02544451, + "epoch": 0.4284254193470811, + "flos": 26361927221760.0, + "grad_norm": 1.8824815416552414, + "language_loss": 0.77565944, + "learning_rate": 2.5535373559125855e-06, + "loss": 0.80355155, + "num_input_tokens_seen": 76840255, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.63671875, + "step": 3563, + "time_per_iteration": 3.9639170169830322 + }, + { + "auxiliary_loss_clip": 0.01495256, + "auxiliary_loss_mlp": 0.01290756, + "balance_loss_clip": 1.13228726, + "balance_loss_mlp": 1.0258255, + "epoch": 0.42854566223772017, + "flos": 29716744419360.0, + "grad_norm": 1.9865898391531116, + "language_loss": 0.81889921, + "learning_rate": 2.552788771600057e-06, + "loss": 0.84675932, + "num_input_tokens_seen": 76860565, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.65039062, + "step": 3564, + "time_per_iteration": 3.234675884246826 + }, + { + "auxiliary_loss_clip": 0.01500757, + "auxiliary_loss_mlp": 0.01288128, + "balance_loss_clip": 1.13814569, + "balance_loss_mlp": 1.02357841, + "epoch": 0.4286659051283593, + "flos": 22020361176960.0, + "grad_norm": 3.41835706426928, + "language_loss": 0.81885618, + "learning_rate": 2.5520401034299118e-06, + "loss": 0.84674501, + "num_input_tokens_seen": 76878325, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.64648438, + "step": 3565, + "time_per_iteration": 3.152125835418701 + }, + { + "auxiliary_loss_clip": 0.01495525, + "auxiliary_loss_mlp": 0.01289343, + "balance_loss_clip": 1.13168216, + "balance_loss_mlp": 1.02384043, + "epoch": 0.4287861480189984, + "flos": 13336129170720.0, + "grad_norm": 2.342603053234393, + "language_loss": 0.87534618, + "learning_rate": 2.551291351515722e-06, + "loss": 0.90319484, + "num_input_tokens_seen": 76895340, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 2.65625, + "step": 3566, + "time_per_iteration": 3.1917872428894043 + }, + { + "auxiliary_loss_clip": 0.01495175, + "auxiliary_loss_mlp": 0.0128684, + "balance_loss_clip": 1.13212562, + "balance_loss_mlp": 1.02171898, + "epoch": 0.42890639090963745, + "flos": 26653976968320.0, + "grad_norm": 3.780729899301714, + "language_loss": 0.85979766, + "learning_rate": 2.5505425159710726e-06, + "loss": 0.88761783, + "num_input_tokens_seen": 76915150, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.65234375, + "step": 3567, + "time_per_iteration": 3.13680362701416 + }, + { + "auxiliary_loss_clip": 0.01498667, + "auxiliary_loss_mlp": 0.01282806, + "balance_loss_clip": 1.13547301, + "balance_loss_mlp": 1.0199734, + "epoch": 0.42902663380027656, + "flos": 24057768549600.0, + "grad_norm": 2.2244615699791526, + "language_loss": 0.82934505, + "learning_rate": 2.549793596909561e-06, + "loss": 0.85715973, + "num_input_tokens_seen": 76933770, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.62890625, + "step": 3568, + "time_per_iteration": 2.9753904342651367 + }, + { + "auxiliary_loss_clip": 0.01493453, + "auxiliary_loss_mlp": 0.01288456, + "balance_loss_clip": 1.13126183, + "balance_loss_mlp": 1.02676773, + "epoch": 0.42914687669091567, + "flos": 15634371049920.0, + "grad_norm": 2.8984063475430304, + "language_loss": 0.66340399, + "learning_rate": 2.5490445944447976e-06, + "loss": 0.69122308, + "num_input_tokens_seen": 76952265, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.6171875, + "step": 3569, + "time_per_iteration": 2.98512601852417 + }, + { + "auxiliary_loss_clip": 0.01500506, + "auxiliary_loss_mlp": 0.01289668, + "balance_loss_clip": 1.13656425, + "balance_loss_mlp": 1.02435565, + "epoch": 0.4292671195815547, + "flos": 31470522096960.0, + "grad_norm": 2.425919866960378, + "language_loss": 0.65302342, + "learning_rate": 2.548295508690406e-06, + "loss": 0.68092519, + "num_input_tokens_seen": 76973560, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.65429688, + "step": 3570, + "time_per_iteration": 3.0196282863616943 + }, + { + "auxiliary_loss_clip": 0.01494686, + "auxiliary_loss_mlp": 0.01299989, + "balance_loss_clip": 1.13117683, + "balance_loss_mlp": 1.0373466, + "epoch": 0.42938736247219383, + "flos": 30260070776160.0, + "grad_norm": 2.199701179182288, + "language_loss": 0.76663762, + "learning_rate": 2.5475463397600217e-06, + "loss": 0.79458439, + "num_input_tokens_seen": 76993640, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.62695312, + "step": 3571, + "time_per_iteration": 3.097928762435913 + }, + { + "auxiliary_loss_clip": 0.01500577, + "auxiliary_loss_mlp": 0.01304879, + "balance_loss_clip": 1.13705254, + "balance_loss_mlp": 1.03994858, + "epoch": 0.42950760536283294, + "flos": 29352175302240.0, + "grad_norm": 2.1498989084244555, + "language_loss": 0.77499318, + "learning_rate": 2.546797087767293e-06, + "loss": 0.80304778, + "num_input_tokens_seen": 77013765, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.65039062, + "step": 3572, + "time_per_iteration": 2.9915316104888916 + }, + { + "auxiliary_loss_clip": 0.01489918, + "auxiliary_loss_mlp": 0.01278856, + "balance_loss_clip": 1.12622237, + "balance_loss_mlp": 1.01754916, + "epoch": 0.429627848253472, + "flos": 26872369499520.0, + "grad_norm": 1.859685498246861, + "language_loss": 0.87364393, + "learning_rate": 2.546047752825881e-06, + "loss": 0.90133166, + "num_input_tokens_seen": 77034370, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.61328125, + "step": 3573, + "time_per_iteration": 2.987650156021118 + }, + { + "auxiliary_loss_clip": 0.01496161, + "auxiliary_loss_mlp": 0.01284701, + "balance_loss_clip": 1.13283944, + "balance_loss_mlp": 1.01957929, + "epoch": 0.4297480911441111, + "flos": 13882982846400.0, + "grad_norm": 2.2269019010765634, + "language_loss": 0.93472326, + "learning_rate": 2.5452983350494595e-06, + "loss": 0.96253186, + "num_input_tokens_seen": 77049925, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.65234375, + "step": 3574, + "time_per_iteration": 3.0403308868408203 + }, + { + "auxiliary_loss_clip": 0.01499171, + "auxiliary_loss_mlp": 0.01288589, + "balance_loss_clip": 1.13646531, + "balance_loss_mlp": 1.02442098, + "epoch": 0.4298683340347502, + "flos": 20743269350400.0, + "grad_norm": 2.9559423947297674, + "language_loss": 0.65231383, + "learning_rate": 2.544548834551713e-06, + "loss": 0.6801914, + "num_input_tokens_seen": 77068930, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.64257812, + "step": 3575, + "time_per_iteration": 2.950861692428589 + }, + { + "auxiliary_loss_clip": 0.01499233, + "auxiliary_loss_mlp": 0.01287488, + "balance_loss_clip": 1.1364218, + "balance_loss_mlp": 1.02351046, + "epoch": 0.4299885769253893, + "flos": 20883984281280.0, + "grad_norm": 2.8939851547525306, + "language_loss": 0.94517463, + "learning_rate": 2.5437992514463424e-06, + "loss": 0.97304177, + "num_input_tokens_seen": 77082255, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.640625, + "step": 3576, + "time_per_iteration": 2.9710659980773926 + }, + { + "auxiliary_loss_clip": 0.01494842, + "auxiliary_loss_mlp": 0.01296606, + "balance_loss_clip": 1.13205016, + "balance_loss_mlp": 1.0299592, + "epoch": 0.4301088198160284, + "flos": 25487029611360.0, + "grad_norm": 1.8138256285661674, + "language_loss": 0.88127613, + "learning_rate": 2.5430495858470565e-06, + "loss": 0.9091906, + "num_input_tokens_seen": 77101725, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.66796875, + "step": 3577, + "time_per_iteration": 3.1059160232543945 + }, + { + "auxiliary_loss_clip": 0.01491377, + "auxiliary_loss_mlp": 0.01305656, + "balance_loss_clip": 1.12804341, + "balance_loss_mlp": 1.03843665, + "epoch": 0.43022906270666744, + "flos": 18261377498880.0, + "grad_norm": 2.4076555010171905, + "language_loss": 0.77484709, + "learning_rate": 2.54229983786758e-06, + "loss": 0.80281746, + "num_input_tokens_seen": 77119670, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.67382812, + "step": 3578, + "time_per_iteration": 3.112070322036743 + }, + { + "auxiliary_loss_clip": 0.01495988, + "auxiliary_loss_mlp": 0.01296574, + "balance_loss_clip": 1.13288319, + "balance_loss_mlp": 1.03088045, + "epoch": 0.43034930559730655, + "flos": 23401528967520.0, + "grad_norm": 1.9503446546987384, + "language_loss": 0.84800303, + "learning_rate": 2.541550007621651e-06, + "loss": 0.8759287, + "num_input_tokens_seen": 77138160, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.65820312, + "step": 3579, + "time_per_iteration": 3.1198885440826416 + }, + { + "auxiliary_loss_clip": 0.01500574, + "auxiliary_loss_mlp": 0.01302998, + "balance_loss_clip": 1.13748384, + "balance_loss_mlp": 1.03692317, + "epoch": 0.43046954848794566, + "flos": 28186934712480.0, + "grad_norm": 1.9355212296816768, + "language_loss": 0.7993452, + "learning_rate": 2.5408000952230156e-06, + "loss": 0.8273809, + "num_input_tokens_seen": 77156950, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.66210938, + "step": 3580, + "time_per_iteration": 3.1310837268829346 + }, + { + "auxiliary_loss_clip": 0.01493781, + "auxiliary_loss_mlp": 0.01302364, + "balance_loss_clip": 1.13093996, + "balance_loss_mlp": 1.03419077, + "epoch": 0.4305897913785847, + "flos": 28582756997760.0, + "grad_norm": 3.9248705159336357, + "language_loss": 0.90490174, + "learning_rate": 2.5400501007854357e-06, + "loss": 0.93286324, + "num_input_tokens_seen": 77176395, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.68359375, + "step": 3581, + "time_per_iteration": 3.1223413944244385 + }, + { + "auxiliary_loss_clip": 0.01492941, + "auxiliary_loss_mlp": 0.01302746, + "balance_loss_clip": 1.12874556, + "balance_loss_mlp": 1.03800583, + "epoch": 0.43071003426922383, + "flos": 20450802394080.0, + "grad_norm": 1.8819159681230408, + "language_loss": 0.75470638, + "learning_rate": 2.539300024422685e-06, + "loss": 0.78266335, + "num_input_tokens_seen": 77194340, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 2.6484375, + "step": 3582, + "time_per_iteration": 3.0042941570281982 + }, + { + "auxiliary_loss_clip": 0.01516491, + "auxiliary_loss_mlp": 0.0121006, + "balance_loss_clip": 1.15912461, + "balance_loss_mlp": 0.99796295, + "epoch": 0.43083027715986294, + "flos": 52003250045280.0, + "grad_norm": 0.8230066029064685, + "language_loss": 0.60863781, + "learning_rate": 2.538549866248549e-06, + "loss": 0.63590336, + "num_input_tokens_seen": 77249320, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 2.125, + "step": 3583, + "time_per_iteration": 3.337571382522583 + }, + { + "auxiliary_loss_clip": 0.0149391, + "auxiliary_loss_mlp": 0.01299565, + "balance_loss_clip": 1.13049304, + "balance_loss_mlp": 1.03215456, + "epoch": 0.430950520050502, + "flos": 16692766920000.0, + "grad_norm": 2.1276322844247786, + "language_loss": 0.81202799, + "learning_rate": 2.5377996263768274e-06, + "loss": 0.83996272, + "num_input_tokens_seen": 77267400, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.67578125, + "step": 3584, + "time_per_iteration": 3.8449759483337402 + }, + { + "auxiliary_loss_clip": 0.01503012, + "auxiliary_loss_mlp": 0.01307175, + "balance_loss_clip": 1.13958967, + "balance_loss_mlp": 1.03995514, + "epoch": 0.4310707629411411, + "flos": 24610766587200.0, + "grad_norm": 1.7625132939142811, + "language_loss": 0.68707323, + "learning_rate": 2.5370493049213293e-06, + "loss": 0.71517509, + "num_input_tokens_seen": 77287045, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.67382812, + "step": 3585, + "time_per_iteration": 4.055643081665039 + }, + { + "auxiliary_loss_clip": 0.01491546, + "auxiliary_loss_mlp": 0.01310386, + "balance_loss_clip": 1.12786257, + "balance_loss_mlp": 1.04164052, + "epoch": 0.4311910058317802, + "flos": 26435432724480.0, + "grad_norm": 2.2558386891626663, + "language_loss": 0.80155003, + "learning_rate": 2.536298901995878e-06, + "loss": 0.82956934, + "num_input_tokens_seen": 77306255, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 2.68945312, + "step": 3586, + "time_per_iteration": 3.0604124069213867 + }, + { + "auxiliary_loss_clip": 0.01498995, + "auxiliary_loss_mlp": 0.01307049, + "balance_loss_clip": 1.13489318, + "balance_loss_mlp": 1.03982925, + "epoch": 0.43131124872241927, + "flos": 25158265041600.0, + "grad_norm": 1.8585303102848072, + "language_loss": 0.80171686, + "learning_rate": 2.535548417714311e-06, + "loss": 0.8297773, + "num_input_tokens_seen": 77325555, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.67382812, + "step": 3587, + "time_per_iteration": 3.0344314575195312 + }, + { + "auxiliary_loss_clip": 0.01491311, + "auxiliary_loss_mlp": 0.01310035, + "balance_loss_clip": 1.12802696, + "balance_loss_mlp": 1.04090774, + "epoch": 0.4314314916130584, + "flos": 21616877403360.0, + "grad_norm": 1.7115388393537032, + "language_loss": 0.87299359, + "learning_rate": 2.534797852190474e-06, + "loss": 0.90100706, + "num_input_tokens_seen": 77345735, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.69335938, + "step": 3588, + "time_per_iteration": 3.9170336723327637 + }, + { + "auxiliary_loss_clip": 0.01496794, + "auxiliary_loss_mlp": 0.01303773, + "balance_loss_clip": 1.13390541, + "balance_loss_mlp": 1.03865123, + "epoch": 0.4315517345036975, + "flos": 19276686614880.0, + "grad_norm": 1.99844104543032, + "language_loss": 0.81903958, + "learning_rate": 2.5340472055382283e-06, + "loss": 0.84704524, + "num_input_tokens_seen": 77361765, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.65234375, + "step": 3589, + "time_per_iteration": 3.9276514053344727 + }, + { + "auxiliary_loss_clip": 0.01498842, + "auxiliary_loss_mlp": 0.01315738, + "balance_loss_clip": 1.13505757, + "balance_loss_mlp": 1.04851818, + "epoch": 0.43167197739433655, + "flos": 24275819727360.0, + "grad_norm": 2.028350236403742, + "language_loss": 0.80829406, + "learning_rate": 2.5332964778714468e-06, + "loss": 0.83643985, + "num_input_tokens_seen": 77378950, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 2.67382812, + "step": 3590, + "time_per_iteration": 3.0861077308654785 + }, + { + "auxiliary_loss_clip": 0.01504383, + "auxiliary_loss_mlp": 0.01301707, + "balance_loss_clip": 1.14159799, + "balance_loss_mlp": 1.03620362, + "epoch": 0.43179222028497566, + "flos": 16869400182720.0, + "grad_norm": 2.109880151545376, + "language_loss": 0.66090763, + "learning_rate": 2.5325456693040123e-06, + "loss": 0.68896854, + "num_input_tokens_seen": 77396145, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.65625, + "step": 3591, + "time_per_iteration": 3.092421054840088 + }, + { + "auxiliary_loss_clip": 0.01497021, + "auxiliary_loss_mlp": 0.01315059, + "balance_loss_clip": 1.13491285, + "balance_loss_mlp": 1.04402423, + "epoch": 0.43191246317561477, + "flos": 17641132104960.0, + "grad_norm": 2.564843176728542, + "language_loss": 0.75036526, + "learning_rate": 2.531794779949824e-06, + "loss": 0.77848613, + "num_input_tokens_seen": 77414045, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.71289062, + "step": 3592, + "time_per_iteration": 2.9846115112304688 + }, + { + "auxiliary_loss_clip": 0.01500758, + "auxiliary_loss_mlp": 0.01309036, + "balance_loss_clip": 1.13911557, + "balance_loss_mlp": 1.04219818, + "epoch": 0.4320327060662538, + "flos": 23881097358720.0, + "grad_norm": 2.0345575948026617, + "language_loss": 0.8840012, + "learning_rate": 2.5310438099227903e-06, + "loss": 0.91209912, + "num_input_tokens_seen": 77431310, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 2.66992188, + "step": 3593, + "time_per_iteration": 3.011871576309204 + }, + { + "auxiliary_loss_clip": 0.01540295, + "auxiliary_loss_mlp": 0.01221436, + "balance_loss_clip": 1.1849556, + "balance_loss_mlp": 1.00628662, + "epoch": 0.43215294895689293, + "flos": 66402212680800.0, + "grad_norm": 0.8406200913408871, + "language_loss": 0.53368086, + "learning_rate": 2.530292759336833e-06, + "loss": 0.56129813, + "num_input_tokens_seen": 77492045, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.15625, + "step": 3594, + "time_per_iteration": 3.5575029850006104 + }, + { + "auxiliary_loss_clip": 0.01494875, + "auxiliary_loss_mlp": 0.01299249, + "balance_loss_clip": 1.13053107, + "balance_loss_mlp": 1.03126645, + "epoch": 0.432273191847532, + "flos": 20596334201280.0, + "grad_norm": 2.71634502640844, + "language_loss": 0.6942693, + "learning_rate": 2.5295416283058855e-06, + "loss": 0.72221053, + "num_input_tokens_seen": 77510910, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 2.68164062, + "step": 3595, + "time_per_iteration": 2.9641191959381104 + }, + { + "auxiliary_loss_clip": 0.01503313, + "auxiliary_loss_mlp": 0.01294438, + "balance_loss_clip": 1.14165688, + "balance_loss_mlp": 1.0310334, + "epoch": 0.4323934347381711, + "flos": 19284196390560.0, + "grad_norm": 1.7259585512912492, + "language_loss": 0.66344678, + "learning_rate": 2.5287904169438943e-06, + "loss": 0.69142431, + "num_input_tokens_seen": 77530115, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 2.63476562, + "step": 3596, + "time_per_iteration": 3.011606216430664 + }, + { + "auxiliary_loss_clip": 0.01504744, + "auxiliary_loss_mlp": 0.01304041, + "balance_loss_clip": 1.14218998, + "balance_loss_mlp": 1.03453255, + "epoch": 0.4325136776288102, + "flos": 21728501071200.0, + "grad_norm": 2.8285452671440665, + "language_loss": 0.64636409, + "learning_rate": 2.528039125364817e-06, + "loss": 0.67445195, + "num_input_tokens_seen": 77548920, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.69726562, + "step": 3597, + "time_per_iteration": 3.0881733894348145 + }, + { + "auxiliary_loss_clip": 0.01499461, + "auxiliary_loss_mlp": 0.01303884, + "balance_loss_clip": 1.13607407, + "balance_loss_mlp": 1.03742754, + "epoch": 0.43263392051944927, + "flos": 22342640031360.0, + "grad_norm": 2.5996613400174557, + "language_loss": 0.75995809, + "learning_rate": 2.5272877536826246e-06, + "loss": 0.78799152, + "num_input_tokens_seen": 77567715, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.66601562, + "step": 3598, + "time_per_iteration": 3.0118839740753174 + }, + { + "auxiliary_loss_clip": 0.0149266, + "auxiliary_loss_mlp": 0.01289542, + "balance_loss_clip": 1.12965965, + "balance_loss_mlp": 1.02384806, + "epoch": 0.4327541634100884, + "flos": 29170952732160.0, + "grad_norm": 2.5172219916060037, + "language_loss": 0.70546031, + "learning_rate": 2.5265363020112986e-06, + "loss": 0.73328233, + "num_input_tokens_seen": 77588035, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.65820312, + "step": 3599, + "time_per_iteration": 3.073413372039795 + }, + { + "auxiliary_loss_clip": 0.0149717, + "auxiliary_loss_mlp": 0.01299814, + "balance_loss_clip": 1.1340884, + "balance_loss_mlp": 1.03469276, + "epoch": 0.4328744063007275, + "flos": 26069839547040.0, + "grad_norm": 1.900819012258322, + "language_loss": 0.83921939, + "learning_rate": 2.5257847704648344e-06, + "loss": 0.86718917, + "num_input_tokens_seen": 77609265, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.65234375, + "step": 3600, + "time_per_iteration": 3.142732620239258 + }, + { + "auxiliary_loss_clip": 0.01498902, + "auxiliary_loss_mlp": 0.0128172, + "balance_loss_clip": 1.13499188, + "balance_loss_mlp": 1.01659846, + "epoch": 0.43299464919136654, + "flos": 16583229300960.0, + "grad_norm": 2.231110979417506, + "language_loss": 0.75360131, + "learning_rate": 2.525033159157239e-06, + "loss": 0.78140754, + "num_input_tokens_seen": 77625580, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 2.65234375, + "step": 3601, + "time_per_iteration": 3.0960357189178467 + }, + { + "auxiliary_loss_clip": 0.01493374, + "auxiliary_loss_mlp": 0.01288928, + "balance_loss_clip": 1.12928247, + "balance_loss_mlp": 1.02399683, + "epoch": 0.43311489208200565, + "flos": 16109122564800.0, + "grad_norm": 2.354103588398275, + "language_loss": 0.76973677, + "learning_rate": 2.52428146820253e-06, + "loss": 0.79755974, + "num_input_tokens_seen": 77643835, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 2.65039062, + "step": 3602, + "time_per_iteration": 3.041900634765625 + }, + { + "auxiliary_loss_clip": 0.01496271, + "auxiliary_loss_mlp": 0.01295613, + "balance_loss_clip": 1.1314857, + "balance_loss_mlp": 1.03030133, + "epoch": 0.43323513497264476, + "flos": 22932466676640.0, + "grad_norm": 2.342901429111724, + "language_loss": 0.81906873, + "learning_rate": 2.52352969771474e-06, + "loss": 0.84698755, + "num_input_tokens_seen": 77663060, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 2.65429688, + "step": 3603, + "time_per_iteration": 3.0402326583862305 + }, + { + "auxiliary_loss_clip": 0.01503913, + "auxiliary_loss_mlp": 0.01294453, + "balance_loss_clip": 1.14128184, + "balance_loss_mlp": 1.02742434, + "epoch": 0.4333553778632838, + "flos": 25301255662080.0, + "grad_norm": 1.9645580482700549, + "language_loss": 0.88133979, + "learning_rate": 2.5227778478079106e-06, + "loss": 0.90932345, + "num_input_tokens_seen": 77682470, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.671875, + "step": 3604, + "time_per_iteration": 3.1521780490875244 + }, + { + "auxiliary_loss_clip": 0.01500676, + "auxiliary_loss_mlp": 0.01288877, + "balance_loss_clip": 1.13739729, + "balance_loss_mlp": 1.02451849, + "epoch": 0.43347562075392293, + "flos": 19388992989600.0, + "grad_norm": 1.8045461927019746, + "language_loss": 0.77221143, + "learning_rate": 2.522025918596098e-06, + "loss": 0.800107, + "num_input_tokens_seen": 77700770, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.64453125, + "step": 3605, + "time_per_iteration": 3.1397743225097656 + }, + { + "auxiliary_loss_clip": 0.01499516, + "auxiliary_loss_mlp": 0.01285047, + "balance_loss_clip": 1.1345911, + "balance_loss_mlp": 1.02354968, + "epoch": 0.43359586364456204, + "flos": 26328550076640.0, + "grad_norm": 1.5236916891193553, + "language_loss": 0.65591919, + "learning_rate": 2.521273910193368e-06, + "loss": 0.68376487, + "num_input_tokens_seen": 77723950, + "router_z_loss_clip": 3.65039062, + "router_z_loss_mlp": 2.61523438, + "step": 3606, + "time_per_iteration": 3.132615566253662 + }, + { + "auxiliary_loss_clip": 0.01499323, + "auxiliary_loss_mlp": 0.01293081, + "balance_loss_clip": 1.13486803, + "balance_loss_mlp": 1.02834153, + "epoch": 0.4337161065352011, + "flos": 15990633900000.0, + "grad_norm": 2.7692237323563336, + "language_loss": 0.87354201, + "learning_rate": 2.5205218227138006e-06, + "loss": 0.90146601, + "num_input_tokens_seen": 77736905, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 2.6484375, + "step": 3607, + "time_per_iteration": 3.092743158340454 + }, + { + "auxiliary_loss_clip": 0.01496752, + "auxiliary_loss_mlp": 0.01288696, + "balance_loss_clip": 1.13217664, + "balance_loss_mlp": 1.0256722, + "epoch": 0.4338363494258402, + "flos": 20226493069920.0, + "grad_norm": 2.211342715284625, + "language_loss": 0.79402685, + "learning_rate": 2.519769656271486e-06, + "loss": 0.82188129, + "num_input_tokens_seen": 77754325, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 2.63085938, + "step": 3608, + "time_per_iteration": 3.000870943069458 + }, + { + "auxiliary_loss_clip": 0.01497844, + "auxiliary_loss_mlp": 0.01287864, + "balance_loss_clip": 1.13390732, + "balance_loss_mlp": 1.02350545, + "epoch": 0.43395659231647926, + "flos": 20085967779840.0, + "grad_norm": 2.227308715015345, + "language_loss": 0.67724276, + "learning_rate": 2.5190174109805285e-06, + "loss": 0.70509982, + "num_input_tokens_seen": 77774150, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 2.64453125, + "step": 3609, + "time_per_iteration": 2.9741814136505127 + }, + { + "auxiliary_loss_clip": 0.01492662, + "auxiliary_loss_mlp": 0.01280147, + "balance_loss_clip": 1.12795234, + "balance_loss_mlp": 1.01617014, + "epoch": 0.43407683520711837, + "flos": 19903872862080.0, + "grad_norm": 2.0412306410583767, + "language_loss": 0.64111578, + "learning_rate": 2.518265086955042e-06, + "loss": 0.66884387, + "num_input_tokens_seen": 77791870, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 2.640625, + "step": 3610, + "time_per_iteration": 3.0751543045043945 + }, + { + "auxiliary_loss_clip": 0.01496405, + "auxiliary_loss_mlp": 0.01277303, + "balance_loss_clip": 1.13094115, + "balance_loss_mlp": 1.0137074, + "epoch": 0.4341970780977575, + "flos": 23110768778400.0, + "grad_norm": 1.8221685645552912, + "language_loss": 0.83680487, + "learning_rate": 2.5175126843091534e-06, + "loss": 0.86454195, + "num_input_tokens_seen": 77811240, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 2.63671875, + "step": 3611, + "time_per_iteration": 2.96351957321167 + }, + { + "auxiliary_loss_clip": 0.01499415, + "auxiliary_loss_mlp": 0.01291487, + "balance_loss_clip": 1.13536084, + "balance_loss_mlp": 1.02789152, + "epoch": 0.43431732098839654, + "flos": 37410965756640.0, + "grad_norm": 2.332807339784554, + "language_loss": 0.75773352, + "learning_rate": 2.5167602031570034e-06, + "loss": 0.78564256, + "num_input_tokens_seen": 77831425, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 2.63671875, + "step": 3612, + "time_per_iteration": 3.9222919940948486 + }, + { + "auxiliary_loss_clip": 0.01500329, + "auxiliary_loss_mlp": 0.01286047, + "balance_loss_clip": 1.13696647, + "balance_loss_mlp": 1.0224514, + "epoch": 0.43443756387903565, + "flos": 31870895761440.0, + "grad_norm": 1.701293746015582, + "language_loss": 0.7362088, + "learning_rate": 2.51600764361274e-06, + "loss": 0.76407254, + "num_input_tokens_seen": 77852950, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.63671875, + "step": 3613, + "time_per_iteration": 3.9903159141540527 + }, + { + "auxiliary_loss_clip": 0.01502715, + "auxiliary_loss_mlp": 0.01294213, + "balance_loss_clip": 1.13817441, + "balance_loss_mlp": 1.03214347, + "epoch": 0.43455780676967476, + "flos": 23479396208640.0, + "grad_norm": 3.1584721072009483, + "language_loss": 0.79087293, + "learning_rate": 2.5152550057905283e-06, + "loss": 0.81884217, + "num_input_tokens_seen": 77872840, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 2.62109375, + "step": 3614, + "time_per_iteration": 2.9897899627685547 + }, + { + "auxiliary_loss_clip": 0.01500806, + "auxiliary_loss_mlp": 0.012871, + "balance_loss_clip": 1.1366291, + "balance_loss_mlp": 1.01892698, + "epoch": 0.4346780496603138, + "flos": 24209368862400.0, + "grad_norm": 2.815225528675845, + "language_loss": 0.76990777, + "learning_rate": 2.5145022898045415e-06, + "loss": 0.79778677, + "num_input_tokens_seen": 77892025, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 2.68359375, + "step": 3615, + "time_per_iteration": 2.9928619861602783 + }, + { + "auxiliary_loss_clip": 0.01500917, + "auxiliary_loss_mlp": 0.01292044, + "balance_loss_clip": 1.13648248, + "balance_loss_mlp": 1.0263505, + "epoch": 0.4347982925509529, + "flos": 17094278429280.0, + "grad_norm": 2.3642712020483585, + "language_loss": 0.89667028, + "learning_rate": 2.5137494957689664e-06, + "loss": 0.92459989, + "num_input_tokens_seen": 77907635, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 2.65820312, + "step": 3616, + "time_per_iteration": 3.822894811630249 + }, + { + "auxiliary_loss_clip": 0.01558882, + "auxiliary_loss_mlp": 0.01206459, + "balance_loss_clip": 1.20267415, + "balance_loss_mlp": 0.99131012, + "epoch": 0.43491853544159204, + "flos": 60951805948800.0, + "grad_norm": 0.7669157618240027, + "language_loss": 0.57340419, + "learning_rate": 2.5129966237980016e-06, + "loss": 0.60105759, + "num_input_tokens_seen": 77970630, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.15625, + "step": 3617, + "time_per_iteration": 3.553612232208252 + }, + { + "auxiliary_loss_clip": 0.01502579, + "auxiliary_loss_mlp": 0.01286982, + "balance_loss_clip": 1.13777614, + "balance_loss_mlp": 1.02395904, + "epoch": 0.4350387783322311, + "flos": 21946817746080.0, + "grad_norm": 1.8538630537996776, + "language_loss": 0.78296351, + "learning_rate": 2.512243674005857e-06, + "loss": 0.81085914, + "num_input_tokens_seen": 77989995, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 2.63085938, + "step": 3618, + "time_per_iteration": 3.908078193664551 + }, + { + "auxiliary_loss_clip": 0.01501919, + "auxiliary_loss_mlp": 0.01293551, + "balance_loss_clip": 1.13850594, + "balance_loss_mlp": 1.02938366, + "epoch": 0.4351590212228702, + "flos": 25085062964160.0, + "grad_norm": 2.4260735769836184, + "language_loss": 0.86472368, + "learning_rate": 2.5114906465067537e-06, + "loss": 0.89267838, + "num_input_tokens_seen": 78010980, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 2.64257812, + "step": 3619, + "time_per_iteration": 3.130174160003662 + }, + { + "auxiliary_loss_clip": 0.01499274, + "auxiliary_loss_mlp": 0.01288744, + "balance_loss_clip": 1.13482702, + "balance_loss_mlp": 1.02667427, + "epoch": 0.4352792641135093, + "flos": 21508667269920.0, + "grad_norm": 2.0243438672022553, + "language_loss": 0.75483131, + "learning_rate": 2.5107375414149264e-06, + "loss": 0.78271157, + "num_input_tokens_seen": 78030225, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 2.62109375, + "step": 3620, + "time_per_iteration": 3.1217198371887207 + }, + { + "auxiliary_loss_clip": 0.0149311, + "auxiliary_loss_mlp": 0.0129728, + "balance_loss_clip": 1.12861776, + "balance_loss_mlp": 1.03006101, + "epoch": 0.43539950700414837, + "flos": 16255337078880.0, + "grad_norm": 2.1895197318445327, + "language_loss": 0.71747231, + "learning_rate": 2.5099843588446197e-06, + "loss": 0.74537617, + "num_input_tokens_seen": 78048545, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 2.67382812, + "step": 3621, + "time_per_iteration": 3.0587730407714844 + }, + { + "auxiliary_loss_clip": 0.01499705, + "auxiliary_loss_mlp": 0.01312516, + "balance_loss_clip": 1.13485265, + "balance_loss_mlp": 1.0479666, + "epoch": 0.4355197498947875, + "flos": 16693753052160.0, + "grad_norm": 1.6651900476279045, + "language_loss": 0.61695063, + "learning_rate": 2.509231098910091e-06, + "loss": 0.64507282, + "num_input_tokens_seen": 78068415, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 2.64648438, + "step": 3622, + "time_per_iteration": 3.1331284046173096 + }, + { + "auxiliary_loss_clip": 0.0150205, + "auxiliary_loss_mlp": 0.01307653, + "balance_loss_clip": 1.13579297, + "balance_loss_mlp": 1.04329491, + "epoch": 0.4356399927854266, + "flos": 16364609200800.0, + "grad_norm": 2.841372771986424, + "language_loss": 0.75245965, + "learning_rate": 2.508477761725611e-06, + "loss": 0.78055668, + "num_input_tokens_seen": 78086690, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 2.64453125, + "step": 3623, + "time_per_iteration": 3.106046199798584 + }, + { + "auxiliary_loss_clip": 0.01493819, + "auxiliary_loss_mlp": 0.0129988, + "balance_loss_clip": 1.12891531, + "balance_loss_mlp": 1.03494906, + "epoch": 0.43576023567606564, + "flos": 17203967760960.0, + "grad_norm": 1.8347591752515173, + "language_loss": 0.80851334, + "learning_rate": 2.507724347405458e-06, + "loss": 0.83645034, + "num_input_tokens_seen": 78104640, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 2.65039062, + "step": 3624, + "time_per_iteration": 3.1096153259277344 + }, + { + "auxiliary_loss_clip": 0.01495611, + "auxiliary_loss_mlp": 0.01293563, + "balance_loss_clip": 1.13111305, + "balance_loss_mlp": 1.03053951, + "epoch": 0.43588047856670475, + "flos": 15919100661600.0, + "grad_norm": 1.8860370387663068, + "language_loss": 0.8178916, + "learning_rate": 2.5069708560639243e-06, + "loss": 0.84578335, + "num_input_tokens_seen": 78122550, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 2.63085938, + "step": 3625, + "time_per_iteration": 3.022904872894287 + }, + { + "auxiliary_loss_clip": 0.01498778, + "auxiliary_loss_mlp": 0.01300385, + "balance_loss_clip": 1.13438225, + "balance_loss_mlp": 1.03392839, + "epoch": 0.4360007214573438, + "flos": 23661832479840.0, + "grad_norm": 2.169809190205939, + "language_loss": 0.61859322, + "learning_rate": 2.5062172878153158e-06, + "loss": 0.64658487, + "num_input_tokens_seen": 78141825, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 2.66601562, + "step": 3626, + "time_per_iteration": 3.0796027183532715 + }, + { + "auxiliary_loss_clip": 0.01495798, + "auxiliary_loss_mlp": 0.01295996, + "balance_loss_clip": 1.13158488, + "balance_loss_mlp": 1.03144646, + "epoch": 0.4361209643479829, + "flos": 21980801741760.0, + "grad_norm": 7.949499066790288, + "language_loss": 0.87529492, + "learning_rate": 2.505463642773947e-06, + "loss": 0.90321285, + "num_input_tokens_seen": 78161790, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.64648438, + "step": 3627, + "time_per_iteration": 2.985008955001831 + }, + { + "auxiliary_loss_clip": 0.01496345, + "auxiliary_loss_mlp": 0.0129695, + "balance_loss_clip": 1.1317327, + "balance_loss_mlp": 1.03316355, + "epoch": 0.43624120723862203, + "flos": 17422056866880.0, + "grad_norm": 2.4783572163641727, + "language_loss": 0.75488132, + "learning_rate": 2.504709921054146e-06, + "loss": 0.78281432, + "num_input_tokens_seen": 78178605, + "router_z_loss_clip": 3.65039062, + "router_z_loss_mlp": 2.63867188, + "step": 3628, + "time_per_iteration": 3.00761079788208 + }, + { + "auxiliary_loss_clip": 0.01492591, + "auxiliary_loss_mlp": 0.01302066, + "balance_loss_clip": 1.12747908, + "balance_loss_mlp": 1.03618133, + "epoch": 0.4363614501292611, + "flos": 17897263519680.0, + "grad_norm": 2.3737844992059878, + "language_loss": 0.83910316, + "learning_rate": 2.50395612277025e-06, + "loss": 0.86704969, + "num_input_tokens_seen": 78194460, + "router_z_loss_clip": 3.65039062, + "router_z_loss_mlp": 2.66015625, + "step": 3629, + "time_per_iteration": 2.9554452896118164 + }, + { + "auxiliary_loss_clip": 0.01494971, + "auxiliary_loss_mlp": 0.01289109, + "balance_loss_clip": 1.12873542, + "balance_loss_mlp": 1.02494121, + "epoch": 0.4364816930199002, + "flos": 20304967161600.0, + "grad_norm": 2.3386794224477048, + "language_loss": 0.7311489, + "learning_rate": 2.503202248036612e-06, + "loss": 0.75898969, + "num_input_tokens_seen": 78213315, + "router_z_loss_clip": 3.66210938, + "router_z_loss_mlp": 2.64257812, + "step": 3630, + "time_per_iteration": 2.997637987136841 + }, + { + "auxiliary_loss_clip": 0.01501687, + "auxiliary_loss_mlp": 0.01291137, + "balance_loss_clip": 1.13675666, + "balance_loss_mlp": 1.02792239, + "epoch": 0.4366019359105393, + "flos": 24063647414400.0, + "grad_norm": 5.632498985685107, + "language_loss": 0.73685247, + "learning_rate": 2.5024482969675927e-06, + "loss": 0.76478064, + "num_input_tokens_seen": 78233270, + "router_z_loss_clip": 3.65039062, + "router_z_loss_mlp": 2.6328125, + "step": 3631, + "time_per_iteration": 3.046513795852661 + }, + { + "auxiliary_loss_clip": 0.01500831, + "auxiliary_loss_mlp": 0.01292533, + "balance_loss_clip": 1.13681364, + "balance_loss_mlp": 1.02970052, + "epoch": 0.43672217880117836, + "flos": 21755809710720.0, + "grad_norm": 2.165023878766846, + "language_loss": 0.84554517, + "learning_rate": 2.501694269677566e-06, + "loss": 0.87347877, + "num_input_tokens_seen": 78251040, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.62890625, + "step": 3632, + "time_per_iteration": 2.989899158477783 + }, + { + "auxiliary_loss_clip": 0.01498683, + "auxiliary_loss_mlp": 0.01289872, + "balance_loss_clip": 1.13382101, + "balance_loss_mlp": 1.02570438, + "epoch": 0.4368424216918175, + "flos": 18036840605760.0, + "grad_norm": 2.02439798022572, + "language_loss": 0.81017387, + "learning_rate": 2.500940166280918e-06, + "loss": 0.83805943, + "num_input_tokens_seen": 78269470, + "router_z_loss_clip": 3.65039062, + "router_z_loss_mlp": 2.64257812, + "step": 3633, + "time_per_iteration": 2.990469455718994 + }, + { + "auxiliary_loss_clip": 0.01507107, + "auxiliary_loss_mlp": 0.01281425, + "balance_loss_clip": 1.14294481, + "balance_loss_mlp": 1.0189743, + "epoch": 0.4369626645824566, + "flos": 25449480368640.0, + "grad_norm": 1.9770824746795819, + "language_loss": 0.79042149, + "learning_rate": 2.500185986892045e-06, + "loss": 0.8183068, + "num_input_tokens_seen": 78288955, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 2.625, + "step": 3634, + "time_per_iteration": 3.09468674659729 + }, + { + "auxiliary_loss_clip": 0.01500139, + "auxiliary_loss_mlp": 0.01300322, + "balance_loss_clip": 1.1342864, + "balance_loss_mlp": 1.03500938, + "epoch": 0.43708290747309564, + "flos": 25305124334400.0, + "grad_norm": 2.7353561601119587, + "language_loss": 0.77043086, + "learning_rate": 2.499431731625355e-06, + "loss": 0.79843545, + "num_input_tokens_seen": 78307980, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 2.65429688, + "step": 3635, + "time_per_iteration": 3.1141557693481445 + }, + { + "auxiliary_loss_clip": 0.01496268, + "auxiliary_loss_mlp": 0.01297782, + "balance_loss_clip": 1.13182986, + "balance_loss_mlp": 1.03208852, + "epoch": 0.43720315036373475, + "flos": 31577708170080.0, + "grad_norm": 2.009803820163697, + "language_loss": 0.79698241, + "learning_rate": 2.4986774005952686e-06, + "loss": 0.82492292, + "num_input_tokens_seen": 78330355, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 2.65820312, + "step": 3636, + "time_per_iteration": 3.067988157272339 + }, + { + "auxiliary_loss_clip": 0.01505095, + "auxiliary_loss_mlp": 0.01293325, + "balance_loss_clip": 1.14010954, + "balance_loss_mlp": 1.03087354, + "epoch": 0.43732339325437386, + "flos": 23114220240960.0, + "grad_norm": 2.1183421153371413, + "language_loss": 0.84944528, + "learning_rate": 2.4979229939162166e-06, + "loss": 0.87742949, + "num_input_tokens_seen": 78349135, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 2.625, + "step": 3637, + "time_per_iteration": 3.0671346187591553 + }, + { + "auxiliary_loss_clip": 0.01503405, + "auxiliary_loss_mlp": 0.01283554, + "balance_loss_clip": 1.13908339, + "balance_loss_mlp": 1.02205622, + "epoch": 0.4374436361450129, + "flos": 27748784236320.0, + "grad_norm": 1.9094935476985555, + "language_loss": 0.80641174, + "learning_rate": 2.4971685117026433e-06, + "loss": 0.83428133, + "num_input_tokens_seen": 78368900, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 2.61523438, + "step": 3638, + "time_per_iteration": 3.2431235313415527 + }, + { + "auxiliary_loss_clip": 0.01501434, + "auxiliary_loss_mlp": 0.01277946, + "balance_loss_clip": 1.13680553, + "balance_loss_mlp": 1.0172112, + "epoch": 0.437563879035652, + "flos": 24174778016160.0, + "grad_norm": 1.6634174805248507, + "language_loss": 0.76839256, + "learning_rate": 2.4964139540690018e-06, + "loss": 0.79618639, + "num_input_tokens_seen": 78392235, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 2.60742188, + "step": 3639, + "time_per_iteration": 4.069352865219116 + }, + { + "auxiliary_loss_clip": 0.01502039, + "auxiliary_loss_mlp": 0.01293994, + "balance_loss_clip": 1.13666904, + "balance_loss_mlp": 1.02829993, + "epoch": 0.4376841219262911, + "flos": 23479472064960.0, + "grad_norm": 1.9099501826431677, + "language_loss": 0.72837019, + "learning_rate": 2.495659321129758e-06, + "loss": 0.75633049, + "num_input_tokens_seen": 78409980, + "router_z_loss_clip": 3.65429688, + "router_z_loss_mlp": 2.65820312, + "step": 3640, + "time_per_iteration": 3.1148717403411865 + }, + { + "auxiliary_loss_clip": 0.01496612, + "auxiliary_loss_mlp": 0.01281839, + "balance_loss_clip": 1.13201165, + "balance_loss_mlp": 1.0195787, + "epoch": 0.4378043648169302, + "flos": 25450201003680.0, + "grad_norm": 1.8001858543460203, + "language_loss": 0.76173079, + "learning_rate": 2.494904612999389e-06, + "loss": 0.78951532, + "num_input_tokens_seen": 78428690, + "router_z_loss_clip": 3.65039062, + "router_z_loss_mlp": 2.62304688, + "step": 3641, + "time_per_iteration": 3.9723384380340576 + }, + { + "auxiliary_loss_clip": 0.01558344, + "auxiliary_loss_mlp": 0.01226196, + "balance_loss_clip": 1.19936013, + "balance_loss_mlp": 1.01028442, + "epoch": 0.4379246077075693, + "flos": 53920499549760.0, + "grad_norm": 0.7561848395814476, + "language_loss": 0.56504393, + "learning_rate": 2.4941498297923843e-06, + "loss": 0.59288943, + "num_input_tokens_seen": 78489260, + "router_z_loss_clip": 3.59375, + "router_z_loss_mlp": 2.1640625, + "step": 3642, + "time_per_iteration": 3.4481637477874756 + }, + { + "auxiliary_loss_clip": 0.01504504, + "auxiliary_loss_mlp": 0.01290381, + "balance_loss_clip": 1.13860035, + "balance_loss_mlp": 1.02735734, + "epoch": 0.43804485059820836, + "flos": 20590038126720.0, + "grad_norm": 1.968052632601382, + "language_loss": 0.6962719, + "learning_rate": 2.4933949716232424e-06, + "loss": 0.72422069, + "num_input_tokens_seen": 78506785, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 2.63085938, + "step": 3643, + "time_per_iteration": 3.90789532661438 + }, + { + "auxiliary_loss_clip": 0.01499458, + "auxiliary_loss_mlp": 0.0128755, + "balance_loss_clip": 1.13333714, + "balance_loss_mlp": 1.02490854, + "epoch": 0.43816509348884747, + "flos": 23878366531200.0, + "grad_norm": 2.2685730088869036, + "language_loss": 0.7395491, + "learning_rate": 2.492640038606476e-06, + "loss": 0.76741922, + "num_input_tokens_seen": 78525150, + "router_z_loss_clip": 3.66015625, + "router_z_loss_mlp": 2.62695312, + "step": 3644, + "time_per_iteration": 3.114626884460449 + }, + { + "auxiliary_loss_clip": 0.01492691, + "auxiliary_loss_mlp": 0.01291832, + "balance_loss_clip": 1.12752175, + "balance_loss_mlp": 1.02804601, + "epoch": 0.4382853363794866, + "flos": 14686233433920.0, + "grad_norm": 2.220205987917685, + "language_loss": 0.7885896, + "learning_rate": 2.491885030856608e-06, + "loss": 0.81643486, + "num_input_tokens_seen": 78543245, + "router_z_loss_clip": 3.65429688, + "router_z_loss_mlp": 2.63867188, + "step": 3645, + "time_per_iteration": 3.304095506668091 + }, + { + "auxiliary_loss_clip": 0.01501961, + "auxiliary_loss_mlp": 0.01286879, + "balance_loss_clip": 1.13712764, + "balance_loss_mlp": 1.0227108, + "epoch": 0.43840557927012563, + "flos": 17167215009600.0, + "grad_norm": 2.547977197897214, + "language_loss": 0.82810026, + "learning_rate": 2.4911299484881713e-06, + "loss": 0.85598862, + "num_input_tokens_seen": 78560775, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 2.64257812, + "step": 3646, + "time_per_iteration": 4.033006191253662 + }, + { + "auxiliary_loss_clip": 0.01496393, + "auxiliary_loss_mlp": 0.01284544, + "balance_loss_clip": 1.13256943, + "balance_loss_mlp": 1.02228391, + "epoch": 0.43852582216076474, + "flos": 19392709949280.0, + "grad_norm": 1.6413537077784834, + "language_loss": 0.81339818, + "learning_rate": 2.490374791615712e-06, + "loss": 0.84120756, + "num_input_tokens_seen": 78580800, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 2.62304688, + "step": 3647, + "time_per_iteration": 3.0931386947631836 + }, + { + "auxiliary_loss_clip": 0.01507478, + "auxiliary_loss_mlp": 0.01304572, + "balance_loss_clip": 1.14410377, + "balance_loss_mlp": 1.03887868, + "epoch": 0.43864606505140386, + "flos": 18076968963360.0, + "grad_norm": 4.000160232639307, + "language_loss": 0.78078973, + "learning_rate": 2.4896195603537867e-06, + "loss": 0.80891019, + "num_input_tokens_seen": 78595410, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 2.65820312, + "step": 3648, + "time_per_iteration": 2.9484825134277344 + }, + { + "auxiliary_loss_clip": 0.0151201, + "auxiliary_loss_mlp": 0.01278771, + "balance_loss_clip": 1.14854765, + "balance_loss_mlp": 1.01651001, + "epoch": 0.4387663079420429, + "flos": 19646869099680.0, + "grad_norm": 2.0675872124442782, + "language_loss": 0.74450374, + "learning_rate": 2.488864254816964e-06, + "loss": 0.77241158, + "num_input_tokens_seen": 78614100, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.62304688, + "step": 3649, + "time_per_iteration": 2.934699296951294 + }, + { + "auxiliary_loss_clip": 0.01502585, + "auxiliary_loss_mlp": 0.01291468, + "balance_loss_clip": 1.1375941, + "balance_loss_mlp": 1.02424896, + "epoch": 0.438886550832682, + "flos": 19721398662720.0, + "grad_norm": 2.2578079429468176, + "language_loss": 0.6824044, + "learning_rate": 2.4881088751198218e-06, + "loss": 0.71034491, + "num_input_tokens_seen": 78632260, + "router_z_loss_clip": 3.65039062, + "router_z_loss_mlp": 2.67382812, + "step": 3650, + "time_per_iteration": 2.9819576740264893 + }, + { + "auxiliary_loss_clip": 0.01497486, + "auxiliary_loss_mlp": 0.01286887, + "balance_loss_clip": 1.13274086, + "balance_loss_mlp": 1.02367294, + "epoch": 0.43900679372332113, + "flos": 14538350080800.0, + "grad_norm": 3.2987712487872454, + "language_loss": 0.64156151, + "learning_rate": 2.4873534213769517e-06, + "loss": 0.66940522, + "num_input_tokens_seen": 78647490, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 2.6328125, + "step": 3651, + "time_per_iteration": 2.958679676055908 + }, + { + "auxiliary_loss_clip": 0.01501019, + "auxiliary_loss_mlp": 0.01289514, + "balance_loss_clip": 1.1357137, + "balance_loss_mlp": 1.02706313, + "epoch": 0.4391270366139602, + "flos": 24058261615680.0, + "grad_norm": 1.637817236868143, + "language_loss": 0.72134858, + "learning_rate": 2.4865978937029547e-06, + "loss": 0.74925387, + "num_input_tokens_seen": 78666470, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 2.625, + "step": 3652, + "time_per_iteration": 2.977698802947998 + }, + { + "auxiliary_loss_clip": 0.01498741, + "auxiliary_loss_mlp": 0.01288139, + "balance_loss_clip": 1.13550913, + "balance_loss_mlp": 1.02435267, + "epoch": 0.4392472795045993, + "flos": 31541107131360.0, + "grad_norm": 1.7351166365363977, + "language_loss": 0.66441077, + "learning_rate": 2.485842292212445e-06, + "loss": 0.69227958, + "num_input_tokens_seen": 78687685, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.63867188, + "step": 3653, + "time_per_iteration": 3.1133759021759033 + }, + { + "auxiliary_loss_clip": 0.01502285, + "auxiliary_loss_mlp": 0.0130871, + "balance_loss_clip": 1.13859582, + "balance_loss_mlp": 1.04397011, + "epoch": 0.4393675223952384, + "flos": 14868062854560.0, + "grad_norm": 3.8157257265172477, + "language_loss": 0.80755568, + "learning_rate": 2.485086617020045e-06, + "loss": 0.83566564, + "num_input_tokens_seen": 78706180, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 2.6484375, + "step": 3654, + "time_per_iteration": 2.96333646774292 + }, + { + "auxiliary_loss_clip": 0.01497911, + "auxiliary_loss_mlp": 0.01292537, + "balance_loss_clip": 1.13281798, + "balance_loss_mlp": 1.0262711, + "epoch": 0.43948776528587746, + "flos": 14827441430880.0, + "grad_norm": 2.694768818913875, + "language_loss": 0.82054126, + "learning_rate": 2.4843308682403903e-06, + "loss": 0.84844571, + "num_input_tokens_seen": 78723095, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 2.6640625, + "step": 3655, + "time_per_iteration": 3.0371036529541016 + }, + { + "auxiliary_loss_clip": 0.01492208, + "auxiliary_loss_mlp": 0.01286392, + "balance_loss_clip": 1.12775946, + "balance_loss_mlp": 1.02527618, + "epoch": 0.4396080081765166, + "flos": 13916056566240.0, + "grad_norm": 1.8685698262288606, + "language_loss": 0.8286339, + "learning_rate": 2.4835750459881294e-06, + "loss": 0.85641992, + "num_input_tokens_seen": 78739720, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 2.61132812, + "step": 3656, + "time_per_iteration": 2.992812156677246 + }, + { + "auxiliary_loss_clip": 0.01503804, + "auxiliary_loss_mlp": 0.01292255, + "balance_loss_clip": 1.14106178, + "balance_loss_mlp": 1.02904081, + "epoch": 0.43972825106715563, + "flos": 18224928172800.0, + "grad_norm": 2.7114547915538534, + "language_loss": 0.81915408, + "learning_rate": 2.4828191503779177e-06, + "loss": 0.84711468, + "num_input_tokens_seen": 78757820, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.6328125, + "step": 3657, + "time_per_iteration": 3.1523351669311523 + }, + { + "auxiliary_loss_clip": 0.01492803, + "auxiliary_loss_mlp": 0.01282896, + "balance_loss_clip": 1.12754726, + "balance_loss_mlp": 1.02158976, + "epoch": 0.43984849395779474, + "flos": 16875127334880.0, + "grad_norm": 2.357770722535601, + "language_loss": 0.89976501, + "learning_rate": 2.482063181524425e-06, + "loss": 0.92752206, + "num_input_tokens_seen": 78773720, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 2.61328125, + "step": 3658, + "time_per_iteration": 3.1248323917388916 + }, + { + "auxiliary_loss_clip": 0.01498892, + "auxiliary_loss_mlp": 0.01294326, + "balance_loss_clip": 1.13457942, + "balance_loss_mlp": 1.02748799, + "epoch": 0.43996873684843385, + "flos": 18693156044160.0, + "grad_norm": 3.2319449335813433, + "language_loss": 0.81364936, + "learning_rate": 2.4813071395423307e-06, + "loss": 0.84158158, + "num_input_tokens_seen": 78791285, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 2.66992188, + "step": 3659, + "time_per_iteration": 3.105921506881714 + }, + { + "auxiliary_loss_clip": 0.01490208, + "auxiliary_loss_mlp": 0.01296164, + "balance_loss_clip": 1.12656128, + "balance_loss_mlp": 1.03294992, + "epoch": 0.4400889797390729, + "flos": 23655384692640.0, + "grad_norm": 2.28035898546202, + "language_loss": 0.64692271, + "learning_rate": 2.4805510245463263e-06, + "loss": 0.67478645, + "num_input_tokens_seen": 78811440, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 2.6328125, + "step": 3660, + "time_per_iteration": 3.186452865600586 + }, + { + "auxiliary_loss_clip": 0.01497239, + "auxiliary_loss_mlp": 0.0129701, + "balance_loss_clip": 1.13405824, + "balance_loss_mlp": 1.03360522, + "epoch": 0.440209222629712, + "flos": 23151466058400.0, + "grad_norm": 2.2162516163242234, + "language_loss": 0.60535502, + "learning_rate": 2.4797948366511137e-06, + "loss": 0.63329756, + "num_input_tokens_seen": 78831150, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.63476562, + "step": 3661, + "time_per_iteration": 3.051053524017334 + }, + { + "auxiliary_loss_clip": 0.01493304, + "auxiliary_loss_mlp": 0.01291525, + "balance_loss_clip": 1.12934804, + "balance_loss_mlp": 1.02659464, + "epoch": 0.4403294655203511, + "flos": 24825518015040.0, + "grad_norm": 2.175876558366887, + "language_loss": 0.76148224, + "learning_rate": 2.4790385759714055e-06, + "loss": 0.78933048, + "num_input_tokens_seen": 78850215, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 2.65039062, + "step": 3662, + "time_per_iteration": 3.0124144554138184 + }, + { + "auxiliary_loss_clip": 0.01496393, + "auxiliary_loss_mlp": 0.01281945, + "balance_loss_clip": 1.13311028, + "balance_loss_mlp": 1.01987529, + "epoch": 0.4404497084109902, + "flos": 22567290708960.0, + "grad_norm": 1.9241627070487668, + "language_loss": 0.71322751, + "learning_rate": 2.478282242621926e-06, + "loss": 0.74101079, + "num_input_tokens_seen": 78870675, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.62109375, + "step": 3663, + "time_per_iteration": 3.182375192642212 + }, + { + "auxiliary_loss_clip": 0.01515936, + "auxiliary_loss_mlp": 0.01208923, + "balance_loss_clip": 1.16097164, + "balance_loss_mlp": 0.99453735, + "epoch": 0.4405699513016293, + "flos": 64974013607520.0, + "grad_norm": 0.8467044632308856, + "language_loss": 0.59507763, + "learning_rate": 2.477525836717411e-06, + "loss": 0.62232625, + "num_input_tokens_seen": 78938440, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 2.1484375, + "step": 3664, + "time_per_iteration": 3.645204544067383 + }, + { + "auxiliary_loss_clip": 0.014938, + "auxiliary_loss_mlp": 0.01296068, + "balance_loss_clip": 1.13130307, + "balance_loss_mlp": 1.03247261, + "epoch": 0.4406901941922684, + "flos": 35664811639200.0, + "grad_norm": 2.2727123688599216, + "language_loss": 0.79855454, + "learning_rate": 2.476769358372606e-06, + "loss": 0.82645321, + "num_input_tokens_seen": 78960090, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.63671875, + "step": 3665, + "time_per_iteration": 3.116884231567383 + }, + { + "auxiliary_loss_clip": 0.01502712, + "auxiliary_loss_mlp": 0.01291981, + "balance_loss_clip": 1.13919568, + "balance_loss_mlp": 1.02972078, + "epoch": 0.44081043708290746, + "flos": 18042719470560.0, + "grad_norm": 3.0575518198567693, + "language_loss": 0.74203849, + "learning_rate": 2.4760128077022683e-06, + "loss": 0.76998544, + "num_input_tokens_seen": 78978225, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 2.62304688, + "step": 3666, + "time_per_iteration": 3.070983648300171 + }, + { + "auxiliary_loss_clip": 0.01494225, + "auxiliary_loss_mlp": 0.01292961, + "balance_loss_clip": 1.1320318, + "balance_loss_mlp": 1.03012884, + "epoch": 0.44093067997354657, + "flos": 30156108596640.0, + "grad_norm": 5.946617344399876, + "language_loss": 0.68752658, + "learning_rate": 2.4752561848211672e-06, + "loss": 0.71539843, + "num_input_tokens_seen": 79000625, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.62890625, + "step": 3667, + "time_per_iteration": 3.8655529022216797 + }, + { + "auxiliary_loss_clip": 0.0150191, + "auxiliary_loss_mlp": 0.01289842, + "balance_loss_clip": 1.13991094, + "balance_loss_mlp": 1.02891684, + "epoch": 0.4410509228641857, + "flos": 23257135005120.0, + "grad_norm": 1.8901425770391982, + "language_loss": 0.71402168, + "learning_rate": 2.4744994898440797e-06, + "loss": 0.74193919, + "num_input_tokens_seen": 79019415, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.609375, + "step": 3668, + "time_per_iteration": 3.8778913021087646 + }, + { + "auxiliary_loss_clip": 0.01495869, + "auxiliary_loss_mlp": 0.01301789, + "balance_loss_clip": 1.13338447, + "balance_loss_mlp": 1.03704858, + "epoch": 0.44117116575482473, + "flos": 19502626849920.0, + "grad_norm": 2.1469483423181583, + "language_loss": 0.83585143, + "learning_rate": 2.473742722885797e-06, + "loss": 0.86382794, + "num_input_tokens_seen": 79038435, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 2.6484375, + "step": 3669, + "time_per_iteration": 2.98380446434021 + }, + { + "auxiliary_loss_clip": 0.01502436, + "auxiliary_loss_mlp": 0.01318533, + "balance_loss_clip": 1.14049196, + "balance_loss_mlp": 1.05112231, + "epoch": 0.44129140864546385, + "flos": 27055602262080.0, + "grad_norm": 2.8634308667727546, + "language_loss": 0.65270555, + "learning_rate": 2.4729858840611197e-06, + "loss": 0.68091524, + "num_input_tokens_seen": 79057345, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.67578125, + "step": 3670, + "time_per_iteration": 3.8188884258270264 + }, + { + "auxiliary_loss_clip": 0.01497102, + "auxiliary_loss_mlp": 0.01302937, + "balance_loss_clip": 1.13456774, + "balance_loss_mlp": 1.03800607, + "epoch": 0.4414116515361029, + "flos": 26104392465120.0, + "grad_norm": 1.9594461153306122, + "language_loss": 0.73027283, + "learning_rate": 2.4722289734848605e-06, + "loss": 0.75827324, + "num_input_tokens_seen": 79077810, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.65039062, + "step": 3671, + "time_per_iteration": 2.9588515758514404 + }, + { + "auxiliary_loss_clip": 0.01494349, + "auxiliary_loss_mlp": 0.01299534, + "balance_loss_clip": 1.13242817, + "balance_loss_mlp": 1.03574753, + "epoch": 0.441531894426742, + "flos": 21908130658560.0, + "grad_norm": 2.3955168201918524, + "language_loss": 0.77855104, + "learning_rate": 2.471471991271841e-06, + "loss": 0.80648983, + "num_input_tokens_seen": 79094935, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.63867188, + "step": 3672, + "time_per_iteration": 3.0263872146606445 + }, + { + "auxiliary_loss_clip": 0.01491244, + "auxiliary_loss_mlp": 0.013024, + "balance_loss_clip": 1.12660265, + "balance_loss_mlp": 1.03823209, + "epoch": 0.4416521373173811, + "flos": 23439154066560.0, + "grad_norm": 4.134156762254085, + "language_loss": 0.79514802, + "learning_rate": 2.470714937536896e-06, + "loss": 0.82308441, + "num_input_tokens_seen": 79113660, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 2.64257812, + "step": 3673, + "time_per_iteration": 3.799823045730591 + }, + { + "auxiliary_loss_clip": 0.0149393, + "auxiliary_loss_mlp": 0.01298822, + "balance_loss_clip": 1.13259459, + "balance_loss_mlp": 1.0310303, + "epoch": 0.4417723802080202, + "flos": 20336144473440.0, + "grad_norm": 2.4598172225595176, + "language_loss": 0.71053946, + "learning_rate": 2.469957812394868e-06, + "loss": 0.73846698, + "num_input_tokens_seen": 79132470, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 2.6796875, + "step": 3674, + "time_per_iteration": 2.9566569328308105 + }, + { + "auxiliary_loss_clip": 0.0149869, + "auxiliary_loss_mlp": 0.01297192, + "balance_loss_clip": 1.13691521, + "balance_loss_mlp": 1.0334053, + "epoch": 0.4418926230986593, + "flos": 18882722809440.0, + "grad_norm": 2.1747759337451593, + "language_loss": 0.76625025, + "learning_rate": 2.4692006159606148e-06, + "loss": 0.79420906, + "num_input_tokens_seen": 79150000, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.63867188, + "step": 3675, + "time_per_iteration": 2.9836642742156982 + }, + { + "auxiliary_loss_clip": 0.01500673, + "auxiliary_loss_mlp": 0.01310058, + "balance_loss_clip": 1.14085388, + "balance_loss_mlp": 1.04360163, + "epoch": 0.4420128659892984, + "flos": 19466480949120.0, + "grad_norm": 1.831491681103113, + "language_loss": 0.78700781, + "learning_rate": 2.468443348349e-06, + "loss": 0.81511515, + "num_input_tokens_seen": 79167875, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 2.66601562, + "step": 3676, + "time_per_iteration": 3.0860865116119385 + }, + { + "auxiliary_loss_clip": 0.01503156, + "auxiliary_loss_mlp": 0.01297081, + "balance_loss_clip": 1.14217794, + "balance_loss_mlp": 1.03119707, + "epoch": 0.44213310887993745, + "flos": 17896277387520.0, + "grad_norm": 3.0115712028467128, + "language_loss": 0.82842815, + "learning_rate": 2.467686009674902e-06, + "loss": 0.85643053, + "num_input_tokens_seen": 79182325, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 2.66015625, + "step": 3677, + "time_per_iteration": 3.0159783363342285 + }, + { + "auxiliary_loss_clip": 0.01499871, + "auxiliary_loss_mlp": 0.01299823, + "balance_loss_clip": 1.13789845, + "balance_loss_mlp": 1.03431976, + "epoch": 0.44225335177057656, + "flos": 19206480862080.0, + "grad_norm": 2.1947700879327567, + "language_loss": 0.85242844, + "learning_rate": 2.466928600053209e-06, + "loss": 0.88042539, + "num_input_tokens_seen": 79197630, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.65625, + "step": 3678, + "time_per_iteration": 3.135272979736328 + }, + { + "auxiliary_loss_clip": 0.01493742, + "auxiliary_loss_mlp": 0.01291144, + "balance_loss_clip": 1.13100863, + "balance_loss_mlp": 1.02792978, + "epoch": 0.4423735946612157, + "flos": 23473251846720.0, + "grad_norm": 1.9081980818847226, + "language_loss": 0.71350121, + "learning_rate": 2.466171119598818e-06, + "loss": 0.74135005, + "num_input_tokens_seen": 79217600, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.6328125, + "step": 3679, + "time_per_iteration": 3.0495448112487793 + }, + { + "auxiliary_loss_clip": 0.01495013, + "auxiliary_loss_mlp": 0.01291826, + "balance_loss_clip": 1.13311148, + "balance_loss_mlp": 1.02613258, + "epoch": 0.44249383755185473, + "flos": 26687695466880.0, + "grad_norm": 2.9590726669637766, + "language_loss": 0.7736367, + "learning_rate": 2.465413568426639e-06, + "loss": 0.80150509, + "num_input_tokens_seen": 79238550, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.65820312, + "step": 3680, + "time_per_iteration": 3.11746883392334 + }, + { + "auxiliary_loss_clip": 0.01495535, + "auxiliary_loss_mlp": 0.01292639, + "balance_loss_clip": 1.13284206, + "balance_loss_mlp": 1.03114176, + "epoch": 0.44261408044249384, + "flos": 23149835147520.0, + "grad_norm": 1.7232545498052378, + "language_loss": 0.81519449, + "learning_rate": 2.464655946651591e-06, + "loss": 0.84307623, + "num_input_tokens_seen": 79257555, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.61523438, + "step": 3681, + "time_per_iteration": 3.1237869262695312 + }, + { + "auxiliary_loss_clip": 0.01495972, + "auxiliary_loss_mlp": 0.01292433, + "balance_loss_clip": 1.13469219, + "balance_loss_mlp": 1.02864683, + "epoch": 0.44273432333313295, + "flos": 24464893426560.0, + "grad_norm": 2.86659566194855, + "language_loss": 0.80837005, + "learning_rate": 2.4638982543886065e-06, + "loss": 0.83625412, + "num_input_tokens_seen": 79277595, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 2.63867188, + "step": 3682, + "time_per_iteration": 3.0456552505493164 + }, + { + "auxiliary_loss_clip": 0.01498338, + "auxiliary_loss_mlp": 0.01292695, + "balance_loss_clip": 1.13515615, + "balance_loss_mlp": 1.02986193, + "epoch": 0.442854566223772, + "flos": 17530797994560.0, + "grad_norm": 2.6571730025051665, + "language_loss": 0.87127995, + "learning_rate": 2.4631404917526254e-06, + "loss": 0.89919031, + "num_input_tokens_seen": 79294550, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.62890625, + "step": 3683, + "time_per_iteration": 3.095813751220703 + }, + { + "auxiliary_loss_clip": 0.01497067, + "auxiliary_loss_mlp": 0.0128873, + "balance_loss_clip": 1.13639784, + "balance_loss_mlp": 1.02646911, + "epoch": 0.4429748091144111, + "flos": 24898416667200.0, + "grad_norm": 1.8360800612160246, + "language_loss": 0.79183173, + "learning_rate": 2.4623826588586e-06, + "loss": 0.81968975, + "num_input_tokens_seen": 79314820, + "router_z_loss_clip": 3.60742188, + "router_z_loss_mlp": 2.62304688, + "step": 3684, + "time_per_iteration": 3.0413739681243896 + }, + { + "auxiliary_loss_clip": 0.01488659, + "auxiliary_loss_mlp": 0.01286522, + "balance_loss_clip": 1.12686872, + "balance_loss_mlp": 1.02120972, + "epoch": 0.4430950520050502, + "flos": 21616763618880.0, + "grad_norm": 1.609022475012369, + "language_loss": 0.82855511, + "learning_rate": 2.461624755821492e-06, + "loss": 0.85630697, + "num_input_tokens_seen": 79334300, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.65429688, + "step": 3685, + "time_per_iteration": 3.100799798965454 + }, + { + "auxiliary_loss_clip": 0.01499996, + "auxiliary_loss_mlp": 0.01292167, + "balance_loss_clip": 1.1385572, + "balance_loss_mlp": 1.02609181, + "epoch": 0.4432152948956893, + "flos": 24574620686400.0, + "grad_norm": 2.0249553778992415, + "language_loss": 0.76533264, + "learning_rate": 2.4608667827562763e-06, + "loss": 0.79325426, + "num_input_tokens_seen": 79353630, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 2.66210938, + "step": 3686, + "time_per_iteration": 3.08276629447937 + }, + { + "auxiliary_loss_clip": 0.014985, + "auxiliary_loss_mlp": 0.01291122, + "balance_loss_clip": 1.13753629, + "balance_loss_mlp": 1.02542806, + "epoch": 0.4433355377863284, + "flos": 21764419403040.0, + "grad_norm": 1.9690386850268977, + "language_loss": 0.90244949, + "learning_rate": 2.460108739777936e-06, + "loss": 0.93034577, + "num_input_tokens_seen": 79372765, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 2.65820312, + "step": 3687, + "time_per_iteration": 3.0674166679382324 + }, + { + "auxiliary_loss_clip": 0.01497806, + "auxiliary_loss_mlp": 0.01283443, + "balance_loss_clip": 1.13660347, + "balance_loss_mlp": 1.02118266, + "epoch": 0.44345578067696745, + "flos": 20086574630400.0, + "grad_norm": 1.6496691970533128, + "language_loss": 0.76513875, + "learning_rate": 2.4593506270014656e-06, + "loss": 0.79295123, + "num_input_tokens_seen": 79391735, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 2.62304688, + "step": 3688, + "time_per_iteration": 3.060532569885254 + }, + { + "auxiliary_loss_clip": 0.01489421, + "auxiliary_loss_mlp": 0.01287417, + "balance_loss_clip": 1.12926102, + "balance_loss_mlp": 1.0213418, + "epoch": 0.44357602356760656, + "flos": 24171743763360.0, + "grad_norm": 2.600957041813604, + "language_loss": 0.82006264, + "learning_rate": 2.45859244454187e-06, + "loss": 0.84783095, + "num_input_tokens_seen": 79411525, + "router_z_loss_clip": 3.6015625, + "router_z_loss_mlp": 2.66210938, + "step": 3689, + "time_per_iteration": 3.012186050415039 + }, + { + "auxiliary_loss_clip": 0.01490296, + "auxiliary_loss_mlp": 0.01282015, + "balance_loss_clip": 1.12763143, + "balance_loss_mlp": 1.01994514, + "epoch": 0.44369626645824567, + "flos": 22709750335200.0, + "grad_norm": 1.6432805689237042, + "language_loss": 0.66120434, + "learning_rate": 2.4578341925141655e-06, + "loss": 0.68892747, + "num_input_tokens_seen": 79430740, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.62109375, + "step": 3690, + "time_per_iteration": 2.978546380996704 + }, + { + "auxiliary_loss_clip": 0.01501001, + "auxiliary_loss_mlp": 0.01285668, + "balance_loss_clip": 1.14043415, + "balance_loss_mlp": 1.02130973, + "epoch": 0.4438165093488847, + "flos": 38033031702240.0, + "grad_norm": 2.166669042695617, + "language_loss": 0.72667694, + "learning_rate": 2.457075871033378e-06, + "loss": 0.75454366, + "num_input_tokens_seen": 79452615, + "router_z_loss_clip": 3.609375, + "router_z_loss_mlp": 2.64453125, + "step": 3691, + "time_per_iteration": 3.131856679916382 + }, + { + "auxiliary_loss_clip": 0.0148842, + "auxiliary_loss_mlp": 0.01284849, + "balance_loss_clip": 1.12526393, + "balance_loss_mlp": 1.02239752, + "epoch": 0.44393675223952384, + "flos": 15525023071680.0, + "grad_norm": 2.3491894009013286, + "language_loss": 0.88972938, + "learning_rate": 2.4563174802145445e-06, + "loss": 0.91746205, + "num_input_tokens_seen": 79469865, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.625, + "step": 3692, + "time_per_iteration": 3.087475299835205 + }, + { + "auxiliary_loss_clip": 0.01493393, + "auxiliary_loss_mlp": 0.01260658, + "balance_loss_clip": 1.13512278, + "balance_loss_mlp": 1.04398346, + "epoch": 0.44405699513016295, + "flos": 64582325491680.0, + "grad_norm": 0.647297705771363, + "language_loss": 0.48554766, + "learning_rate": 2.455559020172712e-06, + "loss": 0.51308817, + "num_input_tokens_seen": 79537220, + "router_z_loss_clip": 3.5859375, + "router_z_loss_mlp": 2.171875, + "step": 3693, + "time_per_iteration": 3.5609817504882812 + }, + { + "auxiliary_loss_clip": 0.01509762, + "auxiliary_loss_mlp": 0.01300787, + "balance_loss_clip": 1.14881539, + "balance_loss_mlp": 1.03566599, + "epoch": 0.444177238020802, + "flos": 23989648845600.0, + "grad_norm": 2.5801339982862936, + "language_loss": 0.89696246, + "learning_rate": 2.4548004910229385e-06, + "loss": 0.9250679, + "num_input_tokens_seen": 79554795, + "router_z_loss_clip": 3.609375, + "router_z_loss_mlp": 2.65234375, + "step": 3694, + "time_per_iteration": 3.8611080646514893 + }, + { + "auxiliary_loss_clip": 0.0149747, + "auxiliary_loss_mlp": 0.01295326, + "balance_loss_clip": 1.13583827, + "balance_loss_mlp": 1.03153992, + "epoch": 0.4442974809114411, + "flos": 22565166732000.0, + "grad_norm": 3.4509705578659435, + "language_loss": 0.86729383, + "learning_rate": 2.4540418928802913e-06, + "loss": 0.89522177, + "num_input_tokens_seen": 79573530, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.63867188, + "step": 3695, + "time_per_iteration": 3.936138868331909 + }, + { + "auxiliary_loss_clip": 0.01500693, + "auxiliary_loss_mlp": 0.01296214, + "balance_loss_clip": 1.13871753, + "balance_loss_mlp": 1.02937579, + "epoch": 0.4444177238020802, + "flos": 17677619359200.0, + "grad_norm": 2.637531994897262, + "language_loss": 0.65532625, + "learning_rate": 2.4532832258598506e-06, + "loss": 0.68329525, + "num_input_tokens_seen": 79591360, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 2.66992188, + "step": 3696, + "time_per_iteration": 3.039090871810913 + }, + { + "auxiliary_loss_clip": 0.01496517, + "auxiliary_loss_mlp": 0.01282855, + "balance_loss_clip": 1.13550019, + "balance_loss_mlp": 1.02173889, + "epoch": 0.4445379666927193, + "flos": 28624136984640.0, + "grad_norm": 2.1250824121676546, + "language_loss": 0.81092989, + "learning_rate": 2.4525244900767047e-06, + "loss": 0.83872366, + "num_input_tokens_seen": 79612175, + "router_z_loss_clip": 3.60742188, + "router_z_loss_mlp": 2.61132812, + "step": 3697, + "time_per_iteration": 3.0922954082489014 + }, + { + "auxiliary_loss_clip": 0.01496927, + "auxiliary_loss_mlp": 0.01223907, + "balance_loss_clip": 1.13865662, + "balance_loss_mlp": 1.01028442, + "epoch": 0.4446582095833584, + "flos": 70495232942880.0, + "grad_norm": 0.7751508798856499, + "language_loss": 0.60538781, + "learning_rate": 2.4517656856459536e-06, + "loss": 0.63259614, + "num_input_tokens_seen": 79678020, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 2.140625, + "step": 3698, + "time_per_iteration": 4.503569602966309 + }, + { + "auxiliary_loss_clip": 0.01489393, + "auxiliary_loss_mlp": 0.0128234, + "balance_loss_clip": 1.12723327, + "balance_loss_mlp": 1.0191257, + "epoch": 0.4447784524739975, + "flos": 26507610741600.0, + "grad_norm": 2.157877565190555, + "language_loss": 0.68165958, + "learning_rate": 2.4510068126827073e-06, + "loss": 0.70937687, + "num_input_tokens_seen": 79699020, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 2.6328125, + "step": 3699, + "time_per_iteration": 3.0110743045806885 + }, + { + "auxiliary_loss_clip": 0.01498603, + "auxiliary_loss_mlp": 0.01292224, + "balance_loss_clip": 1.13763607, + "balance_loss_mlp": 1.03072667, + "epoch": 0.44489869536463655, + "flos": 11657829260160.0, + "grad_norm": 3.6656898051015174, + "language_loss": 0.81873852, + "learning_rate": 2.450247871302086e-06, + "loss": 0.84664679, + "num_input_tokens_seen": 79716795, + "router_z_loss_clip": 3.60742188, + "router_z_loss_mlp": 2.61523438, + "step": 3700, + "time_per_iteration": 3.062509298324585 + }, + { + "auxiliary_loss_clip": 0.01494023, + "auxiliary_loss_mlp": 0.01286869, + "balance_loss_clip": 1.13270986, + "balance_loss_mlp": 1.0242269, + "epoch": 0.44501893825527566, + "flos": 20450536896960.0, + "grad_norm": 2.7007005039930343, + "language_loss": 0.83379567, + "learning_rate": 2.44948886161922e-06, + "loss": 0.86160457, + "num_input_tokens_seen": 79735810, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 2.62695312, + "step": 3701, + "time_per_iteration": 3.9136180877685547 + }, + { + "auxiliary_loss_clip": 0.01500927, + "auxiliary_loss_mlp": 0.01285336, + "balance_loss_clip": 1.14010239, + "balance_loss_mlp": 1.02479172, + "epoch": 0.4451391811459148, + "flos": 18263122194240.0, + "grad_norm": 1.675982898810928, + "language_loss": 0.85113287, + "learning_rate": 2.4487297837492524e-06, + "loss": 0.87899554, + "num_input_tokens_seen": 79754975, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 2.60546875, + "step": 3702, + "time_per_iteration": 3.109037399291992 + }, + { + "auxiliary_loss_clip": 0.01493066, + "auxiliary_loss_mlp": 0.0129525, + "balance_loss_clip": 1.13184416, + "balance_loss_mlp": 1.02955675, + "epoch": 0.44525942403655383, + "flos": 16912069727040.0, + "grad_norm": 2.27845518355391, + "language_loss": 0.62328541, + "learning_rate": 2.4479706378073323e-06, + "loss": 0.65116858, + "num_input_tokens_seen": 79773515, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 2.65820312, + "step": 3703, + "time_per_iteration": 3.103804588317871 + }, + { + "auxiliary_loss_clip": 0.01494409, + "auxiliary_loss_mlp": 0.0128379, + "balance_loss_clip": 1.13199186, + "balance_loss_mlp": 1.02305532, + "epoch": 0.44537966692719294, + "flos": 23261117461920.0, + "grad_norm": 1.912753782845424, + "language_loss": 0.8380264, + "learning_rate": 2.447211423908623e-06, + "loss": 0.86580837, + "num_input_tokens_seen": 79793560, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.60742188, + "step": 3704, + "time_per_iteration": 3.037578821182251 + }, + { + "auxiliary_loss_clip": 0.01498774, + "auxiliary_loss_mlp": 0.01280148, + "balance_loss_clip": 1.13726306, + "balance_loss_mlp": 1.0176971, + "epoch": 0.445499909817832, + "flos": 21726225381600.0, + "grad_norm": 2.297197216498953, + "language_loss": 0.74888098, + "learning_rate": 2.4464521421682966e-06, + "loss": 0.77667016, + "num_input_tokens_seen": 79811150, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 2.625, + "step": 3705, + "time_per_iteration": 3.0973286628723145 + }, + { + "auxiliary_loss_clip": 0.01499243, + "auxiliary_loss_mlp": 0.01279893, + "balance_loss_clip": 1.1385895, + "balance_loss_mlp": 1.02278209, + "epoch": 0.4456201527084711, + "flos": 23990255696160.0, + "grad_norm": 1.5366424026720058, + "language_loss": 0.87888777, + "learning_rate": 2.4456927927015345e-06, + "loss": 0.90667915, + "num_input_tokens_seen": 79832190, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 2.5703125, + "step": 3706, + "time_per_iteration": 3.048685312271118 + }, + { + "auxiliary_loss_clip": 0.01503732, + "auxiliary_loss_mlp": 0.01291972, + "balance_loss_clip": 1.14053786, + "balance_loss_mlp": 1.02627826, + "epoch": 0.4457403955991102, + "flos": 18809065594080.0, + "grad_norm": 2.9023251161748935, + "language_loss": 0.76760304, + "learning_rate": 2.4449333756235307e-06, + "loss": 0.79556012, + "num_input_tokens_seen": 79848905, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.65820312, + "step": 3707, + "time_per_iteration": 2.960448741912842 + }, + { + "auxiliary_loss_clip": 0.01509426, + "auxiliary_loss_mlp": 0.01293555, + "balance_loss_clip": 1.14894569, + "balance_loss_mlp": 1.02957797, + "epoch": 0.4458606384897493, + "flos": 19209742683840.0, + "grad_norm": 2.8929148542646366, + "language_loss": 0.79086125, + "learning_rate": 2.4441738910494876e-06, + "loss": 0.81889111, + "num_input_tokens_seen": 79863640, + "router_z_loss_clip": 3.60351562, + "router_z_loss_mlp": 2.640625, + "step": 3708, + "time_per_iteration": 2.971025228500366 + }, + { + "auxiliary_loss_clip": 0.01505243, + "auxiliary_loss_mlp": 0.01299685, + "balance_loss_clip": 1.14209116, + "balance_loss_mlp": 1.03399134, + "epoch": 0.4459808813803884, + "flos": 21363438888000.0, + "grad_norm": 2.6359155484054937, + "language_loss": 0.82117343, + "learning_rate": 2.4434143390946176e-06, + "loss": 0.84922272, + "num_input_tokens_seen": 79882450, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.65820312, + "step": 3709, + "time_per_iteration": 2.9461658000946045 + }, + { + "auxiliary_loss_clip": 0.01503821, + "auxiliary_loss_mlp": 0.01289687, + "balance_loss_clip": 1.14057684, + "balance_loss_mlp": 1.02723539, + "epoch": 0.4461011242710275, + "flos": 23290891431840.0, + "grad_norm": 2.8952637831540358, + "language_loss": 0.85343492, + "learning_rate": 2.4426547198741457e-06, + "loss": 0.88137001, + "num_input_tokens_seen": 79900655, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.625, + "step": 3710, + "time_per_iteration": 3.018670082092285 + }, + { + "auxiliary_loss_clip": 0.01510383, + "auxiliary_loss_mlp": 0.01291574, + "balance_loss_clip": 1.14870405, + "balance_loss_mlp": 1.02569008, + "epoch": 0.44622136716166655, + "flos": 20195050260960.0, + "grad_norm": 2.367446711728568, + "language_loss": 0.74446386, + "learning_rate": 2.441895033503305e-06, + "loss": 0.77248341, + "num_input_tokens_seen": 79918575, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 2.66015625, + "step": 3711, + "time_per_iteration": 3.026386260986328 + }, + { + "auxiliary_loss_clip": 0.0151396, + "auxiliary_loss_mlp": 0.01287155, + "balance_loss_clip": 1.15084326, + "balance_loss_mlp": 1.0226059, + "epoch": 0.44634161005230566, + "flos": 21284888940000.0, + "grad_norm": 2.1622727834895903, + "language_loss": 0.82285804, + "learning_rate": 2.4411352800973375e-06, + "loss": 0.85086918, + "num_input_tokens_seen": 79937010, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.64648438, + "step": 3712, + "time_per_iteration": 2.998096466064453 + }, + { + "auxiliary_loss_clip": 0.01508376, + "auxiliary_loss_mlp": 0.01292767, + "balance_loss_clip": 1.14622378, + "balance_loss_mlp": 1.02688289, + "epoch": 0.44646185294294477, + "flos": 22931556400800.0, + "grad_norm": 4.785957696174228, + "language_loss": 0.75242329, + "learning_rate": 2.4403754597715005e-06, + "loss": 0.78043473, + "num_input_tokens_seen": 79956455, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.66015625, + "step": 3713, + "time_per_iteration": 3.0191571712493896 + }, + { + "auxiliary_loss_clip": 0.01516532, + "auxiliary_loss_mlp": 0.01303201, + "balance_loss_clip": 1.15393233, + "balance_loss_mlp": 1.03903341, + "epoch": 0.4465820958335838, + "flos": 22639885935840.0, + "grad_norm": 2.9285492054892175, + "language_loss": 0.93082094, + "learning_rate": 2.4396155726410553e-06, + "loss": 0.95901835, + "num_input_tokens_seen": 79975065, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.64257812, + "step": 3714, + "time_per_iteration": 3.0646796226501465 + }, + { + "auxiliary_loss_clip": 0.01512478, + "auxiliary_loss_mlp": 0.01286817, + "balance_loss_clip": 1.14930785, + "balance_loss_mlp": 1.02264905, + "epoch": 0.44670233872422294, + "flos": 22674628494720.0, + "grad_norm": 2.625359681497566, + "language_loss": 0.91366732, + "learning_rate": 2.438855618821278e-06, + "loss": 0.94166023, + "num_input_tokens_seen": 79990865, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.64257812, + "step": 3715, + "time_per_iteration": 3.0999979972839355 + }, + { + "auxiliary_loss_clip": 0.01520662, + "auxiliary_loss_mlp": 0.01290874, + "balance_loss_clip": 1.15788639, + "balance_loss_mlp": 1.02785039, + "epoch": 0.44682258161486205, + "flos": 23584192807680.0, + "grad_norm": 3.1980559772447705, + "language_loss": 0.67467284, + "learning_rate": 2.4380955984274517e-06, + "loss": 0.70278817, + "num_input_tokens_seen": 80009520, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.63085938, + "step": 3716, + "time_per_iteration": 3.1141748428344727 + }, + { + "auxiliary_loss_clip": 0.01517152, + "auxiliary_loss_mlp": 0.0129218, + "balance_loss_clip": 1.15403676, + "balance_loss_mlp": 1.02724957, + "epoch": 0.4469428245055011, + "flos": 26503552428480.0, + "grad_norm": 3.3070582081225037, + "language_loss": 0.76937389, + "learning_rate": 2.4373355115748716e-06, + "loss": 0.79746723, + "num_input_tokens_seen": 80030350, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 2.65039062, + "step": 3717, + "time_per_iteration": 3.05008864402771 + }, + { + "auxiliary_loss_clip": 0.01519128, + "auxiliary_loss_mlp": 0.01289769, + "balance_loss_clip": 1.15593719, + "balance_loss_mlp": 1.03017867, + "epoch": 0.4470630673961402, + "flos": 21506960502720.0, + "grad_norm": 2.054085362369723, + "language_loss": 0.72277313, + "learning_rate": 2.436575358378842e-06, + "loss": 0.75086212, + "num_input_tokens_seen": 80049840, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.59570312, + "step": 3718, + "time_per_iteration": 3.0556819438934326 + }, + { + "auxiliary_loss_clip": 0.01527721, + "auxiliary_loss_mlp": 0.01299698, + "balance_loss_clip": 1.16497171, + "balance_loss_mlp": 1.03419542, + "epoch": 0.44718331028677927, + "flos": 16175497573440.0, + "grad_norm": 3.7345195132107207, + "language_loss": 0.83286083, + "learning_rate": 2.4358151389546782e-06, + "loss": 0.86113501, + "num_input_tokens_seen": 80066525, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.65625, + "step": 3719, + "time_per_iteration": 3.0131731033325195 + }, + { + "auxiliary_loss_clip": 0.01518964, + "auxiliary_loss_mlp": 0.01293433, + "balance_loss_clip": 1.15695119, + "balance_loss_mlp": 1.03098178, + "epoch": 0.4473035531774184, + "flos": 19683204641280.0, + "grad_norm": 2.3187479808849747, + "language_loss": 0.75828201, + "learning_rate": 2.4350548534177035e-06, + "loss": 0.78640598, + "num_input_tokens_seen": 80083355, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.625, + "step": 3720, + "time_per_iteration": 3.1031477451324463 + }, + { + "auxiliary_loss_clip": 0.01518416, + "auxiliary_loss_mlp": 0.01296608, + "balance_loss_clip": 1.15513337, + "balance_loss_mlp": 1.03472877, + "epoch": 0.4474237960680575, + "flos": 41430821869440.0, + "grad_norm": 1.9970223138014487, + "language_loss": 0.66762227, + "learning_rate": 2.434294501883254e-06, + "loss": 0.69577247, + "num_input_tokens_seen": 80106450, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.61914062, + "step": 3721, + "time_per_iteration": 4.076504707336426 + }, + { + "auxiliary_loss_clip": 0.01520383, + "auxiliary_loss_mlp": 0.0129255, + "balance_loss_clip": 1.15618336, + "balance_loss_mlp": 1.02742839, + "epoch": 0.44754403895869654, + "flos": 22893438235680.0, + "grad_norm": 3.3005917234324245, + "language_loss": 0.65749323, + "learning_rate": 2.433534084466674e-06, + "loss": 0.68562257, + "num_input_tokens_seen": 80125670, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 2.65234375, + "step": 3722, + "time_per_iteration": 3.0507986545562744 + }, + { + "auxiliary_loss_clip": 0.01515738, + "auxiliary_loss_mlp": 0.01295982, + "balance_loss_clip": 1.15271664, + "balance_loss_mlp": 1.03467488, + "epoch": 0.44766428184933565, + "flos": 25632675203040.0, + "grad_norm": 1.5036547283800548, + "language_loss": 0.70956028, + "learning_rate": 2.4327736012833178e-06, + "loss": 0.73767751, + "num_input_tokens_seen": 80147390, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.61328125, + "step": 3723, + "time_per_iteration": 3.9877562522888184 + }, + { + "auxiliary_loss_clip": 0.01519349, + "auxiliary_loss_mlp": 0.01281908, + "balance_loss_clip": 1.15636647, + "balance_loss_mlp": 1.01735926, + "epoch": 0.44778452473997477, + "flos": 20451067891200.0, + "grad_norm": 1.9721945513174068, + "language_loss": 0.76751959, + "learning_rate": 2.4320130524485506e-06, + "loss": 0.79553223, + "num_input_tokens_seen": 80166185, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.64648438, + "step": 3724, + "time_per_iteration": 3.062164783477783 + }, + { + "auxiliary_loss_clip": 0.0152239, + "auxiliary_loss_mlp": 0.01284911, + "balance_loss_clip": 1.15973592, + "balance_loss_mlp": 1.02474856, + "epoch": 0.4479047676306138, + "flos": 21977312351040.0, + "grad_norm": 1.536508601924197, + "language_loss": 0.79591429, + "learning_rate": 2.431252438077746e-06, + "loss": 0.82398725, + "num_input_tokens_seen": 80185685, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.6015625, + "step": 3725, + "time_per_iteration": 3.8942761421203613 + }, + { + "auxiliary_loss_clip": 0.01524447, + "auxiliary_loss_mlp": 0.01289061, + "balance_loss_clip": 1.16104269, + "balance_loss_mlp": 1.02527428, + "epoch": 0.44802501052125293, + "flos": 21469183691040.0, + "grad_norm": 2.326249578526915, + "language_loss": 0.77035344, + "learning_rate": 2.4304917582862906e-06, + "loss": 0.7984885, + "num_input_tokens_seen": 80204865, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 2.63867188, + "step": 3726, + "time_per_iteration": 3.0742080211639404 + }, + { + "auxiliary_loss_clip": 0.01522754, + "auxiliary_loss_mlp": 0.01285198, + "balance_loss_clip": 1.15925181, + "balance_loss_mlp": 1.02350998, + "epoch": 0.44814525341189204, + "flos": 22129140232800.0, + "grad_norm": 2.0913712729127845, + "language_loss": 0.87473536, + "learning_rate": 2.4297310131895774e-06, + "loss": 0.90281492, + "num_input_tokens_seen": 80223410, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.6171875, + "step": 3727, + "time_per_iteration": 3.018705129623413 + }, + { + "auxiliary_loss_clip": 0.0151569, + "auxiliary_loss_mlp": 0.01279486, + "balance_loss_clip": 1.15135503, + "balance_loss_mlp": 1.017416, + "epoch": 0.4482654963025311, + "flos": 16655786599680.0, + "grad_norm": 2.9184629968967926, + "language_loss": 0.74721771, + "learning_rate": 2.4289702029030113e-06, + "loss": 0.77516949, + "num_input_tokens_seen": 80240880, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 2.62109375, + "step": 3728, + "time_per_iteration": 3.8200440406799316 + }, + { + "auxiliary_loss_clip": 0.01524312, + "auxiliary_loss_mlp": 0.01290001, + "balance_loss_clip": 1.16150045, + "balance_loss_mlp": 1.02621508, + "epoch": 0.4483857391931702, + "flos": 18843125446080.0, + "grad_norm": 2.1328672899495773, + "language_loss": 0.83364904, + "learning_rate": 2.4282093275420057e-06, + "loss": 0.86179215, + "num_input_tokens_seen": 80259910, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.63867188, + "step": 3729, + "time_per_iteration": 2.9561614990234375 + }, + { + "auxiliary_loss_clip": 0.01517755, + "auxiliary_loss_mlp": 0.01284856, + "balance_loss_clip": 1.15399158, + "balance_loss_mlp": 1.0222137, + "epoch": 0.4485059820838093, + "flos": 20374490207520.0, + "grad_norm": 2.9436637097052856, + "language_loss": 0.70814335, + "learning_rate": 2.4274483872219863e-06, + "loss": 0.73616946, + "num_input_tokens_seen": 80277270, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 2.62695312, + "step": 3730, + "time_per_iteration": 3.0221736431121826 + }, + { + "auxiliary_loss_clip": 0.01520422, + "auxiliary_loss_mlp": 0.01285108, + "balance_loss_clip": 1.15560293, + "balance_loss_mlp": 1.02551782, + "epoch": 0.4486262249744484, + "flos": 20049821879040.0, + "grad_norm": 2.009813965592249, + "language_loss": 0.93750829, + "learning_rate": 2.426687382058386e-06, + "loss": 0.9655636, + "num_input_tokens_seen": 80295550, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 2.59570312, + "step": 3731, + "time_per_iteration": 2.9544663429260254 + }, + { + "auxiliary_loss_clip": 0.01549075, + "auxiliary_loss_mlp": 0.01202744, + "balance_loss_clip": 1.18634701, + "balance_loss_mlp": 0.9914093, + "epoch": 0.4487464678650875, + "flos": 64602238140000.0, + "grad_norm": 0.8833019964202752, + "language_loss": 0.59790754, + "learning_rate": 2.425926312166649e-06, + "loss": 0.62542576, + "num_input_tokens_seen": 80348425, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.1171875, + "step": 3732, + "time_per_iteration": 3.3393733501434326 + }, + { + "auxiliary_loss_clip": 0.01516097, + "auxiliary_loss_mlp": 0.01285222, + "balance_loss_clip": 1.15212846, + "balance_loss_mlp": 1.02277136, + "epoch": 0.4488667107557266, + "flos": 20771488265760.0, + "grad_norm": 3.1266498273662147, + "language_loss": 0.73289865, + "learning_rate": 2.42516517766223e-06, + "loss": 0.76091182, + "num_input_tokens_seen": 80366505, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 2.625, + "step": 3733, + "time_per_iteration": 3.0030951499938965 + }, + { + "auxiliary_loss_clip": 0.01525178, + "auxiliary_loss_mlp": 0.012928, + "balance_loss_clip": 1.16082287, + "balance_loss_mlp": 1.02767873, + "epoch": 0.44898695364636565, + "flos": 23954033939040.0, + "grad_norm": 2.0410195956591792, + "language_loss": 0.68005508, + "learning_rate": 2.4244039786605907e-06, + "loss": 0.70823491, + "num_input_tokens_seen": 80387510, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 2.65234375, + "step": 3734, + "time_per_iteration": 3.099423885345459 + }, + { + "auxiliary_loss_clip": 0.01517281, + "auxiliary_loss_mlp": 0.01283476, + "balance_loss_clip": 1.15376115, + "balance_loss_mlp": 1.02064359, + "epoch": 0.44910719653700476, + "flos": 18626325897600.0, + "grad_norm": 2.3668709745675223, + "language_loss": 0.82329321, + "learning_rate": 2.4236427152772055e-06, + "loss": 0.85130078, + "num_input_tokens_seen": 80405915, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 2.62890625, + "step": 3735, + "time_per_iteration": 2.9976933002471924 + }, + { + "auxiliary_loss_clip": 0.01544862, + "auxiliary_loss_mlp": 0.01234245, + "balance_loss_clip": 1.18216872, + "balance_loss_mlp": 1.01909637, + "epoch": 0.4492274394276438, + "flos": 57039022124640.0, + "grad_norm": 0.837779433084597, + "language_loss": 0.57338685, + "learning_rate": 2.422881387627557e-06, + "loss": 0.60117793, + "num_input_tokens_seen": 80458365, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.15625, + "step": 3736, + "time_per_iteration": 3.1754119396209717 + }, + { + "auxiliary_loss_clip": 0.01524629, + "auxiliary_loss_mlp": 0.01286823, + "balance_loss_clip": 1.16088712, + "balance_loss_mlp": 1.02494395, + "epoch": 0.4493476823182829, + "flos": 23256945364320.0, + "grad_norm": 1.6165838028683883, + "language_loss": 0.77326399, + "learning_rate": 2.422119995827139e-06, + "loss": 0.80137849, + "num_input_tokens_seen": 80478490, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 2.61914062, + "step": 3737, + "time_per_iteration": 3.0759825706481934 + }, + { + "auxiliary_loss_clip": 0.01519612, + "auxiliary_loss_mlp": 0.01302572, + "balance_loss_clip": 1.15649676, + "balance_loss_mlp": 1.03897667, + "epoch": 0.44946792520892204, + "flos": 15816959033760.0, + "grad_norm": 3.251283578195856, + "language_loss": 0.73658884, + "learning_rate": 2.4213585399914528e-06, + "loss": 0.76481068, + "num_input_tokens_seen": 80495695, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.63671875, + "step": 3738, + "time_per_iteration": 3.1323039531707764 + }, + { + "auxiliary_loss_clip": 0.01521896, + "auxiliary_loss_mlp": 0.0128805, + "balance_loss_clip": 1.15769696, + "balance_loss_mlp": 1.0288415, + "epoch": 0.4495881680995611, + "flos": 19612126540800.0, + "grad_norm": 2.0150217070896126, + "language_loss": 0.85747707, + "learning_rate": 2.4205970202360113e-06, + "loss": 0.88557661, + "num_input_tokens_seen": 80515260, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 2.59179688, + "step": 3739, + "time_per_iteration": 3.0828945636749268 + }, + { + "auxiliary_loss_clip": 0.01521704, + "auxiliary_loss_mlp": 0.01296798, + "balance_loss_clip": 1.15787935, + "balance_loss_mlp": 1.03339314, + "epoch": 0.4497084109902002, + "flos": 26033883287040.0, + "grad_norm": 2.264022317666219, + "language_loss": 0.78408217, + "learning_rate": 2.4198354366763354e-06, + "loss": 0.81226718, + "num_input_tokens_seen": 80533900, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 2.63476562, + "step": 3740, + "time_per_iteration": 3.1209328174591064 + }, + { + "auxiliary_loss_clip": 0.01524193, + "auxiliary_loss_mlp": 0.01293955, + "balance_loss_clip": 1.15902793, + "balance_loss_mlp": 1.0334115, + "epoch": 0.4498286538808393, + "flos": 14795467627680.0, + "grad_norm": 5.161204136107993, + "language_loss": 0.78836155, + "learning_rate": 2.4190737894279587e-06, + "loss": 0.81654298, + "num_input_tokens_seen": 80551270, + "router_z_loss_clip": 3.65039062, + "router_z_loss_mlp": 2.60546875, + "step": 3741, + "time_per_iteration": 3.1298677921295166 + }, + { + "auxiliary_loss_clip": 0.01521864, + "auxiliary_loss_mlp": 0.01285662, + "balance_loss_clip": 1.15806174, + "balance_loss_mlp": 1.02454567, + "epoch": 0.44994889677147837, + "flos": 15451972706880.0, + "grad_norm": 2.3184034560010303, + "language_loss": 0.80778289, + "learning_rate": 2.4183120786064203e-06, + "loss": 0.83585817, + "num_input_tokens_seen": 80568145, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 2.61132812, + "step": 3742, + "time_per_iteration": 3.07942533493042 + }, + { + "auxiliary_loss_clip": 0.01523551, + "auxiliary_loss_mlp": 0.01287502, + "balance_loss_clip": 1.16148663, + "balance_loss_mlp": 1.02600491, + "epoch": 0.4500691396621175, + "flos": 21800451519360.0, + "grad_norm": 2.4138958394182524, + "language_loss": 0.85340393, + "learning_rate": 2.417550304327273e-06, + "loss": 0.88151449, + "num_input_tokens_seen": 80586185, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 2.61523438, + "step": 3743, + "time_per_iteration": 3.0005557537078857 + }, + { + "auxiliary_loss_clip": 0.01523845, + "auxiliary_loss_mlp": 0.01291454, + "balance_loss_clip": 1.15885425, + "balance_loss_mlp": 1.0269047, + "epoch": 0.4501893825527566, + "flos": 32384903286240.0, + "grad_norm": 1.5768617693084952, + "language_loss": 0.75760305, + "learning_rate": 2.4167884667060763e-06, + "loss": 0.78575605, + "num_input_tokens_seen": 80608895, + "router_z_loss_clip": 3.65039062, + "router_z_loss_mlp": 2.64648438, + "step": 3744, + "time_per_iteration": 3.1956920623779297 + }, + { + "auxiliary_loss_clip": 0.01521042, + "auxiliary_loss_mlp": 0.01298224, + "balance_loss_clip": 1.1578207, + "balance_loss_mlp": 1.03443766, + "epoch": 0.45030962544339564, + "flos": 16546817903040.0, + "grad_norm": 2.754640212755558, + "language_loss": 0.87362063, + "learning_rate": 2.4160265658584e-06, + "loss": 0.90181327, + "num_input_tokens_seen": 80623785, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.63867188, + "step": 3745, + "time_per_iteration": 2.912940502166748 + }, + { + "auxiliary_loss_clip": 0.01526957, + "auxiliary_loss_mlp": 0.01283952, + "balance_loss_clip": 1.16424644, + "balance_loss_mlp": 1.02226424, + "epoch": 0.45042986833403476, + "flos": 19575070364160.0, + "grad_norm": 2.63178428848641, + "language_loss": 0.68555796, + "learning_rate": 2.4152646018998253e-06, + "loss": 0.71366704, + "num_input_tokens_seen": 80642735, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.6171875, + "step": 3746, + "time_per_iteration": 3.0401036739349365 + }, + { + "auxiliary_loss_clip": 0.0151866, + "auxiliary_loss_mlp": 0.01283154, + "balance_loss_clip": 1.15481043, + "balance_loss_mlp": 1.02203834, + "epoch": 0.45055011122467387, + "flos": 23114713307040.0, + "grad_norm": 2.943431363894003, + "language_loss": 0.72287798, + "learning_rate": 2.4145025749459403e-06, + "loss": 0.7508961, + "num_input_tokens_seen": 80663760, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.61132812, + "step": 3747, + "time_per_iteration": 2.962087631225586 + }, + { + "auxiliary_loss_clip": 0.01518956, + "auxiliary_loss_mlp": 0.01284237, + "balance_loss_clip": 1.15509057, + "balance_loss_mlp": 1.02273941, + "epoch": 0.4506703541153129, + "flos": 19936491444000.0, + "grad_norm": 2.025416002931961, + "language_loss": 0.70576894, + "learning_rate": 2.413740485112344e-06, + "loss": 0.73380089, + "num_input_tokens_seen": 80682100, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.61523438, + "step": 3748, + "time_per_iteration": 2.988682746887207 + }, + { + "auxiliary_loss_clip": 0.01514889, + "auxiliary_loss_mlp": 0.0128669, + "balance_loss_clip": 1.15067744, + "balance_loss_mlp": 1.02748108, + "epoch": 0.45079059700595203, + "flos": 19501526933280.0, + "grad_norm": 1.623229937338135, + "language_loss": 0.82519215, + "learning_rate": 2.412978332514646e-06, + "loss": 0.85320801, + "num_input_tokens_seen": 80700880, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 2.59179688, + "step": 3749, + "time_per_iteration": 3.839973211288452 + }, + { + "auxiliary_loss_clip": 0.01521409, + "auxiliary_loss_mlp": 0.01296992, + "balance_loss_clip": 1.15693402, + "balance_loss_mlp": 1.03396845, + "epoch": 0.4509108398965911, + "flos": 27638791479360.0, + "grad_norm": 2.200290877608063, + "language_loss": 0.72062969, + "learning_rate": 2.4122161172684623e-06, + "loss": 0.74881369, + "num_input_tokens_seen": 80721675, + "router_z_loss_clip": 3.65039062, + "router_z_loss_mlp": 2.63085938, + "step": 3750, + "time_per_iteration": 4.012926340103149 + }, + { + "auxiliary_loss_clip": 0.01516487, + "auxiliary_loss_mlp": 0.01290913, + "balance_loss_clip": 1.15210891, + "balance_loss_mlp": 1.02827072, + "epoch": 0.4510310827872302, + "flos": 20997807782400.0, + "grad_norm": 2.2478301914527474, + "language_loss": 0.84263217, + "learning_rate": 2.4114538394894216e-06, + "loss": 0.8707062, + "num_input_tokens_seen": 80739315, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 2.62695312, + "step": 3751, + "time_per_iteration": 2.9357986450195312 + }, + { + "auxiliary_loss_clip": 0.01514344, + "auxiliary_loss_mlp": 0.012834, + "balance_loss_clip": 1.14960635, + "balance_loss_mlp": 1.02247512, + "epoch": 0.4511513256778693, + "flos": 16218546399360.0, + "grad_norm": 1.9784957035872344, + "language_loss": 0.83024085, + "learning_rate": 2.410691499293161e-06, + "loss": 0.85821831, + "num_input_tokens_seen": 80757470, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 2.609375, + "step": 3752, + "time_per_iteration": 3.8117563724517822 + }, + { + "auxiliary_loss_clip": 0.01516477, + "auxiliary_loss_mlp": 0.01293053, + "balance_loss_clip": 1.15053618, + "balance_loss_mlp": 1.0306015, + "epoch": 0.45127156856850836, + "flos": 25188759646560.0, + "grad_norm": 1.9574307430977127, + "language_loss": 0.74093872, + "learning_rate": 2.409929096795326e-06, + "loss": 0.76903403, + "num_input_tokens_seen": 80777840, + "router_z_loss_clip": 3.6640625, + "router_z_loss_mlp": 2.625, + "step": 3753, + "time_per_iteration": 3.0291364192962646 + }, + { + "auxiliary_loss_clip": 0.01518873, + "auxiliary_loss_mlp": 0.01287489, + "balance_loss_clip": 1.15391624, + "balance_loss_mlp": 1.02465653, + "epoch": 0.4513918114591475, + "flos": 20414618565120.0, + "grad_norm": 2.0716558947821193, + "language_loss": 0.79098403, + "learning_rate": 2.409166632111573e-06, + "loss": 0.81904769, + "num_input_tokens_seen": 80795975, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 2.62890625, + "step": 3754, + "time_per_iteration": 3.0411577224731445 + }, + { + "auxiliary_loss_clip": 0.01520972, + "auxiliary_loss_mlp": 0.01294384, + "balance_loss_clip": 1.15655053, + "balance_loss_mlp": 1.02830935, + "epoch": 0.4515120543497866, + "flos": 26650753074720.0, + "grad_norm": 2.109776331332985, + "language_loss": 0.80636883, + "learning_rate": 2.4084041053575674e-06, + "loss": 0.83452237, + "num_input_tokens_seen": 80815395, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 2.66210938, + "step": 3755, + "time_per_iteration": 3.8807754516601562 + }, + { + "auxiliary_loss_clip": 0.01518711, + "auxiliary_loss_mlp": 0.01294637, + "balance_loss_clip": 1.15322816, + "balance_loss_mlp": 1.0304687, + "epoch": 0.45163229724042564, + "flos": 20597206548960.0, + "grad_norm": 1.8911957436362858, + "language_loss": 0.72431606, + "learning_rate": 2.4076415166489834e-06, + "loss": 0.75244945, + "num_input_tokens_seen": 80834805, + "router_z_loss_clip": 3.65429688, + "router_z_loss_mlp": 2.64257812, + "step": 3756, + "time_per_iteration": 3.0203211307525635 + }, + { + "auxiliary_loss_clip": 0.01515185, + "auxiliary_loss_mlp": 0.01287856, + "balance_loss_clip": 1.1499846, + "balance_loss_mlp": 1.02654886, + "epoch": 0.45175254013106475, + "flos": 21691141469280.0, + "grad_norm": 1.683676561550976, + "language_loss": 0.79126716, + "learning_rate": 2.406878866101506e-06, + "loss": 0.81929755, + "num_input_tokens_seen": 80853770, + "router_z_loss_clip": 3.65429688, + "router_z_loss_mlp": 2.61328125, + "step": 3757, + "time_per_iteration": 3.031038522720337 + }, + { + "auxiliary_loss_clip": 0.01524582, + "auxiliary_loss_mlp": 0.01287923, + "balance_loss_clip": 1.15915513, + "balance_loss_mlp": 1.02718854, + "epoch": 0.45187278302170386, + "flos": 18880788473280.0, + "grad_norm": 2.12186416498952, + "language_loss": 0.78471231, + "learning_rate": 2.4061161538308273e-06, + "loss": 0.81283742, + "num_input_tokens_seen": 80870615, + "router_z_loss_clip": 3.65429688, + "router_z_loss_mlp": 2.60742188, + "step": 3758, + "time_per_iteration": 3.0552167892456055 + }, + { + "auxiliary_loss_clip": 0.01519371, + "auxiliary_loss_mlp": 0.01282491, + "balance_loss_clip": 1.15469146, + "balance_loss_mlp": 1.02194667, + "epoch": 0.4519930259123429, + "flos": 18584149419360.0, + "grad_norm": 2.169432408041145, + "language_loss": 0.88534713, + "learning_rate": 2.4053533799526523e-06, + "loss": 0.91336572, + "num_input_tokens_seen": 80886335, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 2.60546875, + "step": 3759, + "time_per_iteration": 3.1173758506774902 + }, + { + "auxiliary_loss_clip": 0.01517842, + "auxiliary_loss_mlp": 0.01287471, + "balance_loss_clip": 1.15196276, + "balance_loss_mlp": 1.02749896, + "epoch": 0.452113268802982, + "flos": 25194752295840.0, + "grad_norm": 1.798030991944873, + "language_loss": 0.86398602, + "learning_rate": 2.404590544582691e-06, + "loss": 0.89203912, + "num_input_tokens_seen": 80904570, + "router_z_loss_clip": 3.66015625, + "router_z_loss_mlp": 2.59960938, + "step": 3760, + "time_per_iteration": 3.094205141067505 + }, + { + "auxiliary_loss_clip": 0.01518985, + "auxiliary_loss_mlp": 0.01291244, + "balance_loss_clip": 1.15168262, + "balance_loss_mlp": 1.02974677, + "epoch": 0.45223351169362114, + "flos": 39381732623520.0, + "grad_norm": 1.8584886417690172, + "language_loss": 0.81088424, + "learning_rate": 2.403827647836666e-06, + "loss": 0.83898658, + "num_input_tokens_seen": 80925125, + "router_z_loss_clip": 3.67382812, + "router_z_loss_mlp": 2.61523438, + "step": 3761, + "time_per_iteration": 3.2044148445129395 + }, + { + "auxiliary_loss_clip": 0.01512467, + "auxiliary_loss_mlp": 0.01284242, + "balance_loss_clip": 1.14828908, + "balance_loss_mlp": 1.02293551, + "epoch": 0.4523537545842602, + "flos": 21584448462240.0, + "grad_norm": 1.9026718156644915, + "language_loss": 0.69607186, + "learning_rate": 2.4030646898303075e-06, + "loss": 0.7240389, + "num_input_tokens_seen": 80946615, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 2.61328125, + "step": 3762, + "time_per_iteration": 3.2173712253570557 + }, + { + "auxiliary_loss_clip": 0.01510967, + "auxiliary_loss_mlp": 0.01283353, + "balance_loss_clip": 1.14335895, + "balance_loss_mlp": 1.02185595, + "epoch": 0.4524739974748993, + "flos": 28442079995040.0, + "grad_norm": 2.8688173353944753, + "language_loss": 0.81902945, + "learning_rate": 2.4023016706793566e-06, + "loss": 0.84697258, + "num_input_tokens_seen": 80966410, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 2.61523438, + "step": 3763, + "time_per_iteration": 3.198984384536743 + }, + { + "auxiliary_loss_clip": 0.0154155, + "auxiliary_loss_mlp": 0.01218605, + "balance_loss_clip": 1.17698526, + "balance_loss_mlp": 1.00498199, + "epoch": 0.4525942403655384, + "flos": 61562265877440.0, + "grad_norm": 0.7826860453873268, + "language_loss": 0.56825697, + "learning_rate": 2.401538590499561e-06, + "loss": 0.59585857, + "num_input_tokens_seen": 81026865, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.140625, + "step": 3764, + "time_per_iteration": 3.500378370285034 + }, + { + "auxiliary_loss_clip": 0.01511906, + "auxiliary_loss_mlp": 0.01288358, + "balance_loss_clip": 1.14662528, + "balance_loss_mlp": 1.02418983, + "epoch": 0.45271448325617747, + "flos": 27532477753920.0, + "grad_norm": 2.0598575655186493, + "language_loss": 0.71778035, + "learning_rate": 2.400775449406682e-06, + "loss": 0.74578297, + "num_input_tokens_seen": 81050060, + "router_z_loss_clip": 3.65820312, + "router_z_loss_mlp": 2.64257812, + "step": 3765, + "time_per_iteration": 2.985867738723755 + }, + { + "auxiliary_loss_clip": 0.0151269, + "auxiliary_loss_mlp": 0.01282516, + "balance_loss_clip": 1.14731038, + "balance_loss_mlp": 1.0227356, + "epoch": 0.4528347261468166, + "flos": 22454642980800.0, + "grad_norm": 1.9388252453535606, + "language_loss": 0.73070031, + "learning_rate": 2.400012247516485e-06, + "loss": 0.75865245, + "num_input_tokens_seen": 81070625, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 2.59765625, + "step": 3766, + "time_per_iteration": 3.0811078548431396 + }, + { + "auxiliary_loss_clip": 0.01514244, + "auxiliary_loss_mlp": 0.01292891, + "balance_loss_clip": 1.14827335, + "balance_loss_mlp": 1.03005862, + "epoch": 0.45295496903745563, + "flos": 21105904131360.0, + "grad_norm": 1.9862248059324359, + "language_loss": 0.90391397, + "learning_rate": 2.3992489849447484e-06, + "loss": 0.93198532, + "num_input_tokens_seen": 81089080, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 2.62890625, + "step": 3767, + "time_per_iteration": 3.055708646774292 + }, + { + "auxiliary_loss_clip": 0.01511727, + "auxiliary_loss_mlp": 0.01287796, + "balance_loss_clip": 1.14816213, + "balance_loss_mlp": 1.02572596, + "epoch": 0.45307521192809475, + "flos": 23223378578400.0, + "grad_norm": 1.7334207222148301, + "language_loss": 0.7869187, + "learning_rate": 2.3984856618072584e-06, + "loss": 0.81491387, + "num_input_tokens_seen": 81109115, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.62109375, + "step": 3768, + "time_per_iteration": 2.998941659927368 + }, + { + "auxiliary_loss_clip": 0.01518386, + "auxiliary_loss_mlp": 0.01287577, + "balance_loss_clip": 1.15516019, + "balance_loss_mlp": 1.02779627, + "epoch": 0.45319545481873386, + "flos": 15561927535680.0, + "grad_norm": 2.2238641317830363, + "language_loss": 0.7384665, + "learning_rate": 2.3977222782198098e-06, + "loss": 0.76652616, + "num_input_tokens_seen": 81127750, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.59765625, + "step": 3769, + "time_per_iteration": 3.0672476291656494 + }, + { + "auxiliary_loss_clip": 0.01512876, + "auxiliary_loss_mlp": 0.01291358, + "balance_loss_clip": 1.14664054, + "balance_loss_mlp": 1.0289067, + "epoch": 0.4533156977093729, + "flos": 21946931530560.0, + "grad_norm": 2.356160144169248, + "language_loss": 0.75365651, + "learning_rate": 2.3969588342982077e-06, + "loss": 0.78169888, + "num_input_tokens_seen": 81147125, + "router_z_loss_clip": 3.66210938, + "router_z_loss_mlp": 2.625, + "step": 3770, + "time_per_iteration": 2.993213653564453 + }, + { + "auxiliary_loss_clip": 0.0152222, + "auxiliary_loss_mlp": 0.0128231, + "balance_loss_clip": 1.15758097, + "balance_loss_mlp": 1.02024007, + "epoch": 0.453435940600012, + "flos": 24244414846560.0, + "grad_norm": 1.7502495514112806, + "language_loss": 0.72741842, + "learning_rate": 2.396195330158267e-06, + "loss": 0.75546372, + "num_input_tokens_seen": 81167015, + "router_z_loss_clip": 3.65429688, + "router_z_loss_mlp": 2.62109375, + "step": 3771, + "time_per_iteration": 3.1381423473358154 + }, + { + "auxiliary_loss_clip": 0.01517567, + "auxiliary_loss_mlp": 0.01292906, + "balance_loss_clip": 1.15273893, + "balance_loss_mlp": 1.03178978, + "epoch": 0.45355618349065113, + "flos": 23442605529120.0, + "grad_norm": 2.001510655326784, + "language_loss": 0.79440904, + "learning_rate": 2.3954317659158094e-06, + "loss": 0.82251376, + "num_input_tokens_seen": 81187350, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 2.61132812, + "step": 3772, + "time_per_iteration": 3.0880324840545654 + }, + { + "auxiliary_loss_clip": 0.01536035, + "auxiliary_loss_mlp": 0.01207283, + "balance_loss_clip": 1.17237401, + "balance_loss_mlp": 0.99594879, + "epoch": 0.4536764263812902, + "flos": 66910341340800.0, + "grad_norm": 0.9075316805231038, + "language_loss": 0.56871074, + "learning_rate": 2.394668141686667e-06, + "loss": 0.59614396, + "num_input_tokens_seen": 81249315, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.1171875, + "step": 3773, + "time_per_iteration": 3.6126017570495605 + }, + { + "auxiliary_loss_clip": 0.01512654, + "auxiliary_loss_mlp": 0.01287989, + "balance_loss_clip": 1.14609706, + "balance_loss_mlp": 1.0287807, + "epoch": 0.4537966692719293, + "flos": 42742997608320.0, + "grad_norm": 2.2030880299251048, + "language_loss": 0.70138824, + "learning_rate": 2.3939044575866813e-06, + "loss": 0.72939467, + "num_input_tokens_seen": 81272065, + "router_z_loss_clip": 3.66796875, + "router_z_loss_mlp": 2.59179688, + "step": 3774, + "time_per_iteration": 3.1961147785186768 + }, + { + "auxiliary_loss_clip": 0.01515972, + "auxiliary_loss_mlp": 0.01288081, + "balance_loss_clip": 1.15012288, + "balance_loss_mlp": 1.02734625, + "epoch": 0.4539169121625684, + "flos": 35552163911040.0, + "grad_norm": 2.2732597196829976, + "language_loss": 0.75826555, + "learning_rate": 2.3931407137317024e-06, + "loss": 0.78630602, + "num_input_tokens_seen": 81292220, + "router_z_loss_clip": 3.65820312, + "router_z_loss_mlp": 2.60742188, + "step": 3775, + "time_per_iteration": 3.954468250274658 + }, + { + "auxiliary_loss_clip": 0.01515313, + "auxiliary_loss_mlp": 0.01285033, + "balance_loss_clip": 1.14949441, + "balance_loss_mlp": 1.0239172, + "epoch": 0.45403715505320746, + "flos": 18516674494080.0, + "grad_norm": 1.8067671571974455, + "language_loss": 0.85128832, + "learning_rate": 2.3923769102375907e-06, + "loss": 0.87929177, + "num_input_tokens_seen": 81311085, + "router_z_loss_clip": 3.65820312, + "router_z_loss_mlp": 2.61132812, + "step": 3776, + "time_per_iteration": 3.9631597995758057 + }, + { + "auxiliary_loss_clip": 0.01511201, + "auxiliary_loss_mlp": 0.01283678, + "balance_loss_clip": 1.14638519, + "balance_loss_mlp": 1.02103615, + "epoch": 0.4541573979438466, + "flos": 25048158500160.0, + "grad_norm": 2.1535835932216165, + "language_loss": 0.7839182, + "learning_rate": 2.391613047220213e-06, + "loss": 0.811867, + "num_input_tokens_seen": 81330985, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 2.62695312, + "step": 3777, + "time_per_iteration": 3.136972188949585 + }, + { + "auxiliary_loss_clip": 0.0150858, + "auxiliary_loss_mlp": 0.01294018, + "balance_loss_clip": 1.14338434, + "balance_loss_mlp": 1.03042221, + "epoch": 0.4542776408344857, + "flos": 18334352007360.0, + "grad_norm": 1.9105421558901687, + "language_loss": 0.7891798, + "learning_rate": 2.390849124795447e-06, + "loss": 0.81720579, + "num_input_tokens_seen": 81346985, + "router_z_loss_clip": 3.65039062, + "router_z_loss_mlp": 2.63671875, + "step": 3778, + "time_per_iteration": 3.279812812805176 + }, + { + "auxiliary_loss_clip": 0.015085, + "auxiliary_loss_mlp": 0.01293752, + "balance_loss_clip": 1.14231408, + "balance_loss_mlp": 1.03320849, + "epoch": 0.45439788372512474, + "flos": 20703103064640.0, + "grad_norm": 1.9112542431686137, + "language_loss": 0.84545946, + "learning_rate": 2.3900851430791804e-06, + "loss": 0.87348193, + "num_input_tokens_seen": 81365005, + "router_z_loss_clip": 3.66210938, + "router_z_loss_mlp": 2.60546875, + "step": 3779, + "time_per_iteration": 3.340447425842285 + }, + { + "auxiliary_loss_clip": 0.01508957, + "auxiliary_loss_mlp": 0.01303108, + "balance_loss_clip": 1.14356768, + "balance_loss_mlp": 1.0379864, + "epoch": 0.45451812661576385, + "flos": 22311614432160.0, + "grad_norm": 2.3022103297118073, + "language_loss": 0.84834552, + "learning_rate": 2.389321102187307e-06, + "loss": 0.87646616, + "num_input_tokens_seen": 81383785, + "router_z_loss_clip": 3.65820312, + "router_z_loss_mlp": 2.65234375, + "step": 3780, + "time_per_iteration": 3.834048271179199 + }, + { + "auxiliary_loss_clip": 0.01511089, + "auxiliary_loss_mlp": 0.0131308, + "balance_loss_clip": 1.14603174, + "balance_loss_mlp": 1.04853082, + "epoch": 0.4546383695064029, + "flos": 21765519319680.0, + "grad_norm": 1.899327113625782, + "language_loss": 0.81857133, + "learning_rate": 2.3885570022357326e-06, + "loss": 0.84681308, + "num_input_tokens_seen": 81402915, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 2.64648438, + "step": 3781, + "time_per_iteration": 3.1100082397460938 + }, + { + "auxiliary_loss_clip": 0.01538113, + "auxiliary_loss_mlp": 0.01270073, + "balance_loss_clip": 1.17433894, + "balance_loss_mlp": 1.05644989, + "epoch": 0.454758612397042, + "flos": 64249957746720.0, + "grad_norm": 0.8158180886042302, + "language_loss": 0.60806346, + "learning_rate": 2.38779284334037e-06, + "loss": 0.63614535, + "num_input_tokens_seen": 81467890, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.140625, + "step": 3782, + "time_per_iteration": 3.6128387451171875 + }, + { + "auxiliary_loss_clip": 0.01512296, + "auxiliary_loss_mlp": 0.01295084, + "balance_loss_clip": 1.14791703, + "balance_loss_mlp": 1.03301394, + "epoch": 0.4548788552876811, + "flos": 27306423734400.0, + "grad_norm": 2.4229188310862977, + "language_loss": 0.78787541, + "learning_rate": 2.387028625617141e-06, + "loss": 0.8159492, + "num_input_tokens_seen": 81487105, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 2.62109375, + "step": 3783, + "time_per_iteration": 3.852843999862671 + }, + { + "auxiliary_loss_clip": 0.01506335, + "auxiliary_loss_mlp": 0.01285052, + "balance_loss_clip": 1.13916874, + "balance_loss_mlp": 1.02412724, + "epoch": 0.4549990981783202, + "flos": 22859378383680.0, + "grad_norm": 1.8917910730076455, + "language_loss": 0.84528971, + "learning_rate": 2.3862643491819766e-06, + "loss": 0.87320358, + "num_input_tokens_seen": 81505670, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 2.609375, + "step": 3784, + "time_per_iteration": 3.001265525817871 + }, + { + "auxiliary_loss_clip": 0.01507102, + "auxiliary_loss_mlp": 0.01291416, + "balance_loss_clip": 1.1403594, + "balance_loss_mlp": 1.03220761, + "epoch": 0.4551193410689593, + "flos": 23260851964800.0, + "grad_norm": 2.3863641542685787, + "language_loss": 0.84347707, + "learning_rate": 2.3855000141508186e-06, + "loss": 0.87146223, + "num_input_tokens_seen": 81525825, + "router_z_loss_clip": 3.66992188, + "router_z_loss_mlp": 2.59179688, + "step": 3785, + "time_per_iteration": 3.0029470920562744 + }, + { + "auxiliary_loss_clip": 0.01510367, + "auxiliary_loss_mlp": 0.01290626, + "balance_loss_clip": 1.14370716, + "balance_loss_mlp": 1.02970052, + "epoch": 0.4552395839595984, + "flos": 20779642820160.0, + "grad_norm": 3.6629478173066965, + "language_loss": 0.84146953, + "learning_rate": 2.3847356206396143e-06, + "loss": 0.86947942, + "num_input_tokens_seen": 81543135, + "router_z_loss_clip": 3.66992188, + "router_z_loss_mlp": 2.609375, + "step": 3786, + "time_per_iteration": 2.975107431411743 + }, + { + "auxiliary_loss_clip": 0.01517518, + "auxiliary_loss_mlp": 0.01294285, + "balance_loss_clip": 1.15123248, + "balance_loss_mlp": 1.03450394, + "epoch": 0.45535982685023746, + "flos": 23259714120000.0, + "grad_norm": 1.610780422440362, + "language_loss": 0.78944147, + "learning_rate": 2.3839711687643227e-06, + "loss": 0.81755948, + "num_input_tokens_seen": 81564360, + "router_z_loss_clip": 3.66210938, + "router_z_loss_mlp": 2.59765625, + "step": 3787, + "time_per_iteration": 3.053455114364624 + }, + { + "auxiliary_loss_clip": 0.01512257, + "auxiliary_loss_mlp": 0.01298232, + "balance_loss_clip": 1.14524424, + "balance_loss_mlp": 1.03349209, + "epoch": 0.45548006974087657, + "flos": 19648158657120.0, + "grad_norm": 2.4107180896580873, + "language_loss": 0.74768257, + "learning_rate": 2.38320665864091e-06, + "loss": 0.77578747, + "num_input_tokens_seen": 81583710, + "router_z_loss_clip": 3.66992188, + "router_z_loss_mlp": 2.6484375, + "step": 3788, + "time_per_iteration": 3.0019569396972656 + }, + { + "auxiliary_loss_clip": 0.01503692, + "auxiliary_loss_mlp": 0.01293542, + "balance_loss_clip": 1.13628876, + "balance_loss_mlp": 1.03185368, + "epoch": 0.4556003126315157, + "flos": 20049859807200.0, + "grad_norm": 1.8932870505128467, + "language_loss": 0.82142514, + "learning_rate": 2.3824420903853516e-06, + "loss": 0.84939748, + "num_input_tokens_seen": 81602175, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 2.6171875, + "step": 3789, + "time_per_iteration": 2.9525070190429688 + }, + { + "auxiliary_loss_clip": 0.01521673, + "auxiliary_loss_mlp": 0.01298283, + "balance_loss_clip": 1.1555258, + "balance_loss_mlp": 1.03716707, + "epoch": 0.45572055552215474, + "flos": 22961595867840.0, + "grad_norm": 2.217569039460694, + "language_loss": 0.82082224, + "learning_rate": 2.3816774641136324e-06, + "loss": 0.84902185, + "num_input_tokens_seen": 81619430, + "router_z_loss_clip": 3.6640625, + "router_z_loss_mlp": 2.61132812, + "step": 3790, + "time_per_iteration": 3.0063297748565674 + }, + { + "auxiliary_loss_clip": 0.01511459, + "auxiliary_loss_mlp": 0.01291343, + "balance_loss_clip": 1.14481568, + "balance_loss_mlp": 1.02984583, + "epoch": 0.45584079841279385, + "flos": 33112751963040.0, + "grad_norm": 2.0606911856619985, + "language_loss": 0.71206516, + "learning_rate": 2.380912779941745e-06, + "loss": 0.74009317, + "num_input_tokens_seen": 81642550, + "router_z_loss_clip": 3.66992188, + "router_z_loss_mlp": 2.61523438, + "step": 3791, + "time_per_iteration": 3.0607478618621826 + }, + { + "auxiliary_loss_clip": 0.01510594, + "auxiliary_loss_mlp": 0.01295081, + "balance_loss_clip": 1.14510393, + "balance_loss_mlp": 1.0276711, + "epoch": 0.45596104130343296, + "flos": 27274677500160.0, + "grad_norm": 2.1584780776802215, + "language_loss": 0.8293156, + "learning_rate": 2.3801480379856918e-06, + "loss": 0.85737228, + "num_input_tokens_seen": 81664260, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 2.67578125, + "step": 3792, + "time_per_iteration": 3.1534225940704346 + }, + { + "auxiliary_loss_clip": 0.0151328, + "auxiliary_loss_mlp": 0.01283055, + "balance_loss_clip": 1.14730954, + "balance_loss_mlp": 1.01964998, + "epoch": 0.456081284194072, + "flos": 21581945203680.0, + "grad_norm": 1.8872738759532919, + "language_loss": 0.83614993, + "learning_rate": 2.379383238361484e-06, + "loss": 0.86411333, + "num_input_tokens_seen": 81683620, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 2.63476562, + "step": 3793, + "time_per_iteration": 3.161621332168579 + }, + { + "auxiliary_loss_clip": 0.0150488, + "auxiliary_loss_mlp": 0.01284337, + "balance_loss_clip": 1.13876319, + "balance_loss_mlp": 1.02474713, + "epoch": 0.4562015270847111, + "flos": 35921777473440.0, + "grad_norm": 1.8592326716071117, + "language_loss": 0.79793608, + "learning_rate": 2.3786183811851407e-06, + "loss": 0.82582819, + "num_input_tokens_seen": 81704325, + "router_z_loss_clip": 3.6640625, + "router_z_loss_mlp": 2.59570312, + "step": 3794, + "time_per_iteration": 3.120246648788452 + }, + { + "auxiliary_loss_clip": 0.01510826, + "auxiliary_loss_mlp": 0.01294581, + "balance_loss_clip": 1.14333701, + "balance_loss_mlp": 1.03289318, + "epoch": 0.45632176997535023, + "flos": 13591502022240.0, + "grad_norm": 1.9250132102905282, + "language_loss": 0.80104041, + "learning_rate": 2.3778534665726892e-06, + "loss": 0.82909447, + "num_input_tokens_seen": 81721155, + "router_z_loss_clip": 3.67773438, + "router_z_loss_mlp": 2.6171875, + "step": 3795, + "time_per_iteration": 3.017503499984741 + }, + { + "auxiliary_loss_clip": 0.01511637, + "auxiliary_loss_mlp": 0.01293122, + "balance_loss_clip": 1.14417362, + "balance_loss_mlp": 1.03391337, + "epoch": 0.4564420128659893, + "flos": 32638190088960.0, + "grad_norm": 2.4314832688066197, + "language_loss": 0.72824973, + "learning_rate": 2.377088494640168e-06, + "loss": 0.75629729, + "num_input_tokens_seen": 81742905, + "router_z_loss_clip": 3.67773438, + "router_z_loss_mlp": 2.59179688, + "step": 3796, + "time_per_iteration": 3.08337140083313 + }, + { + "auxiliary_loss_clip": 0.01512893, + "auxiliary_loss_mlp": 0.01283905, + "balance_loss_clip": 1.14621902, + "balance_loss_mlp": 1.02488756, + "epoch": 0.4565622557566284, + "flos": 20379876006240.0, + "grad_norm": 2.0223432484875636, + "language_loss": 0.7811631, + "learning_rate": 2.3763234655036216e-06, + "loss": 0.80913103, + "num_input_tokens_seen": 81762105, + "router_z_loss_clip": 3.66992188, + "router_z_loss_mlp": 2.58984375, + "step": 3797, + "time_per_iteration": 3.0808517932891846 + }, + { + "auxiliary_loss_clip": 0.01507903, + "auxiliary_loss_mlp": 0.01291351, + "balance_loss_clip": 1.14162862, + "balance_loss_mlp": 1.03042555, + "epoch": 0.45668249864726745, + "flos": 25376733429120.0, + "grad_norm": 2.08694476325629, + "language_loss": 0.87073863, + "learning_rate": 2.3755583792791046e-06, + "loss": 0.89873117, + "num_input_tokens_seen": 81781975, + "router_z_loss_clip": 3.66210938, + "router_z_loss_mlp": 2.609375, + "step": 3798, + "time_per_iteration": 3.020953416824341 + }, + { + "auxiliary_loss_clip": 0.01511292, + "auxiliary_loss_mlp": 0.01292165, + "balance_loss_clip": 1.14383602, + "balance_loss_mlp": 1.0300951, + "epoch": 0.45680274153790656, + "flos": 15561624110400.0, + "grad_norm": 12.386146152993145, + "language_loss": 0.74495858, + "learning_rate": 2.3747932360826803e-06, + "loss": 0.77299321, + "num_input_tokens_seen": 81798905, + "router_z_loss_clip": 3.67382812, + "router_z_loss_mlp": 2.62109375, + "step": 3799, + "time_per_iteration": 3.0951056480407715 + }, + { + "auxiliary_loss_clip": 0.01509277, + "auxiliary_loss_mlp": 0.01303884, + "balance_loss_clip": 1.14276922, + "balance_loss_mlp": 1.03952527, + "epoch": 0.4569229844285457, + "flos": 19794980021760.0, + "grad_norm": 2.377459070735946, + "language_loss": 0.82035333, + "learning_rate": 2.3740280360304205e-06, + "loss": 0.84848493, + "num_input_tokens_seen": 81816630, + "router_z_loss_clip": 3.66992188, + "router_z_loss_mlp": 2.64453125, + "step": 3800, + "time_per_iteration": 3.0401105880737305 + }, + { + "auxiliary_loss_clip": 0.01510608, + "auxiliary_loss_mlp": 0.01286585, + "balance_loss_clip": 1.14450824, + "balance_loss_mlp": 1.0258503, + "epoch": 0.45704322731918473, + "flos": 24096038427360.0, + "grad_norm": 1.7428738451929697, + "language_loss": 0.68146324, + "learning_rate": 2.3732627792384038e-06, + "loss": 0.70943516, + "num_input_tokens_seen": 81837700, + "router_z_loss_clip": 3.6640625, + "router_z_loss_mlp": 2.60742188, + "step": 3801, + "time_per_iteration": 3.085707664489746 + }, + { + "auxiliary_loss_clip": 0.01509594, + "auxiliary_loss_mlp": 0.01291671, + "balance_loss_clip": 1.14390075, + "balance_loss_mlp": 1.0288384, + "epoch": 0.45716347020982384, + "flos": 31320287197920.0, + "grad_norm": 1.8899421483921077, + "language_loss": 0.75539047, + "learning_rate": 2.3724974658227207e-06, + "loss": 0.78340304, + "num_input_tokens_seen": 81858490, + "router_z_loss_clip": 3.65820312, + "router_z_loss_mlp": 2.62890625, + "step": 3802, + "time_per_iteration": 3.1332335472106934 + }, + { + "auxiliary_loss_clip": 0.01507765, + "auxiliary_loss_mlp": 0.01291362, + "balance_loss_clip": 1.13986003, + "balance_loss_mlp": 1.02986419, + "epoch": 0.45728371310046295, + "flos": 26503590356640.0, + "grad_norm": 1.8804305858502899, + "language_loss": 0.71543878, + "learning_rate": 2.3717320958994687e-06, + "loss": 0.74343002, + "num_input_tokens_seen": 81876050, + "router_z_loss_clip": 3.6796875, + "router_z_loss_mlp": 2.61523438, + "step": 3803, + "time_per_iteration": 4.785062313079834 + }, + { + "auxiliary_loss_clip": 0.01508153, + "auxiliary_loss_mlp": 0.01282375, + "balance_loss_clip": 1.13818884, + "balance_loss_mlp": 1.02297521, + "epoch": 0.457403955991102, + "flos": 17931247515360.0, + "grad_norm": 9.038295994073174, + "language_loss": 0.70626819, + "learning_rate": 2.3709666695847534e-06, + "loss": 0.73417342, + "num_input_tokens_seen": 81894230, + "router_z_loss_clip": 3.70117188, + "router_z_loss_mlp": 2.59375, + "step": 3804, + "time_per_iteration": 3.15073299407959 + }, + { + "auxiliary_loss_clip": 0.01512538, + "auxiliary_loss_mlp": 0.01291202, + "balance_loss_clip": 1.14425683, + "balance_loss_mlp": 1.02970469, + "epoch": 0.4575241988817411, + "flos": 42233958672480.0, + "grad_norm": 3.145756059823994, + "language_loss": 0.70626771, + "learning_rate": 2.370201186994689e-06, + "loss": 0.73430502, + "num_input_tokens_seen": 81917915, + "router_z_loss_clip": 3.6796875, + "router_z_loss_mlp": 2.61523438, + "step": 3805, + "time_per_iteration": 3.149153709411621 + }, + { + "auxiliary_loss_clip": 0.01512311, + "auxiliary_loss_mlp": 0.01295287, + "balance_loss_clip": 1.14496422, + "balance_loss_mlp": 1.03264499, + "epoch": 0.45764444177238023, + "flos": 30119924767680.0, + "grad_norm": 1.8568811578545075, + "language_loss": 0.70366001, + "learning_rate": 2.369435648245399e-06, + "loss": 0.73173594, + "num_input_tokens_seen": 81938130, + "router_z_loss_clip": 3.67773438, + "router_z_loss_mlp": 2.62695312, + "step": 3806, + "time_per_iteration": 3.028090715408325 + }, + { + "auxiliary_loss_clip": 0.01513376, + "auxiliary_loss_mlp": 0.01294359, + "balance_loss_clip": 1.14556313, + "balance_loss_mlp": 1.0309546, + "epoch": 0.4577646846630193, + "flos": 24062547497760.0, + "grad_norm": 1.8677613229808503, + "language_loss": 0.85130489, + "learning_rate": 2.368670053453015e-06, + "loss": 0.87938225, + "num_input_tokens_seen": 81959820, + "router_z_loss_clip": 3.68359375, + "router_z_loss_mlp": 2.63476562, + "step": 3807, + "time_per_iteration": 3.8678295612335205 + }, + { + "auxiliary_loss_clip": 0.01521641, + "auxiliary_loss_mlp": 0.01298953, + "balance_loss_clip": 1.15405273, + "balance_loss_mlp": 1.03535795, + "epoch": 0.4578849275536584, + "flos": 17420388027840.0, + "grad_norm": 2.497360339650806, + "language_loss": 0.7472831, + "learning_rate": 2.3679044027336757e-06, + "loss": 0.77548909, + "num_input_tokens_seen": 81975710, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 2.63671875, + "step": 3808, + "time_per_iteration": 2.9756782054901123 + }, + { + "auxiliary_loss_clip": 0.01518804, + "auxiliary_loss_mlp": 0.01293497, + "balance_loss_clip": 1.15209365, + "balance_loss_mlp": 1.03009176, + "epoch": 0.4580051704442975, + "flos": 13511510804160.0, + "grad_norm": 2.9461700218902127, + "language_loss": 0.69419217, + "learning_rate": 2.3671386962035326e-06, + "loss": 0.72231519, + "num_input_tokens_seen": 81993180, + "router_z_loss_clip": 3.66796875, + "router_z_loss_mlp": 2.63476562, + "step": 3809, + "time_per_iteration": 3.0911550521850586 + }, + { + "auxiliary_loss_clip": 0.01518727, + "auxiliary_loss_mlp": 0.01291722, + "balance_loss_clip": 1.152336, + "balance_loss_mlp": 1.02755404, + "epoch": 0.45812541333493656, + "flos": 18039381792480.0, + "grad_norm": 2.515891361695792, + "language_loss": 0.68642128, + "learning_rate": 2.3663729339787405e-06, + "loss": 0.71452582, + "num_input_tokens_seen": 82010115, + "router_z_loss_clip": 3.6640625, + "router_z_loss_mlp": 2.64257812, + "step": 3810, + "time_per_iteration": 3.858096122741699 + }, + { + "auxiliary_loss_clip": 0.01527854, + "auxiliary_loss_mlp": 0.01292718, + "balance_loss_clip": 1.16139007, + "balance_loss_mlp": 1.02816892, + "epoch": 0.45824565622557567, + "flos": 20223838098720.0, + "grad_norm": 2.1974243662593813, + "language_loss": 0.73310006, + "learning_rate": 2.365607116175466e-06, + "loss": 0.76130581, + "num_input_tokens_seen": 82025540, + "router_z_loss_clip": 3.6640625, + "router_z_loss_mlp": 2.64648438, + "step": 3811, + "time_per_iteration": 3.0378854274749756 + }, + { + "auxiliary_loss_clip": 0.01521043, + "auxiliary_loss_mlp": 0.01286177, + "balance_loss_clip": 1.15306759, + "balance_loss_mlp": 1.02849388, + "epoch": 0.4583658991162148, + "flos": 19866930469920.0, + "grad_norm": 2.5295724205317853, + "language_loss": 0.67129117, + "learning_rate": 2.3648412429098825e-06, + "loss": 0.69936335, + "num_input_tokens_seen": 82043890, + "router_z_loss_clip": 3.67773438, + "router_z_loss_mlp": 2.57617188, + "step": 3812, + "time_per_iteration": 3.0114099979400635 + }, + { + "auxiliary_loss_clip": 0.01526842, + "auxiliary_loss_mlp": 0.0129571, + "balance_loss_clip": 1.16051471, + "balance_loss_mlp": 1.0324955, + "epoch": 0.45848614200685384, + "flos": 21031905562560.0, + "grad_norm": 2.176394166186154, + "language_loss": 0.82219708, + "learning_rate": 2.364075314298172e-06, + "loss": 0.85042262, + "num_input_tokens_seen": 82061345, + "router_z_loss_clip": 3.6640625, + "router_z_loss_mlp": 2.6328125, + "step": 3813, + "time_per_iteration": 3.008837938308716 + }, + { + "auxiliary_loss_clip": 0.01534539, + "auxiliary_loss_mlp": 0.01284959, + "balance_loss_clip": 1.16787124, + "balance_loss_mlp": 1.02536893, + "epoch": 0.45860638489749295, + "flos": 21071427069600.0, + "grad_norm": 2.0214978610551575, + "language_loss": 0.70220518, + "learning_rate": 2.3633093304565267e-06, + "loss": 0.73040015, + "num_input_tokens_seen": 82080400, + "router_z_loss_clip": 3.66210938, + "router_z_loss_mlp": 2.59570312, + "step": 3814, + "time_per_iteration": 3.087291717529297 + }, + { + "auxiliary_loss_clip": 0.01534505, + "auxiliary_loss_mlp": 0.01302701, + "balance_loss_clip": 1.1682539, + "balance_loss_mlp": 1.03891492, + "epoch": 0.458726627788132, + "flos": 26836337383200.0, + "grad_norm": 1.9749407467391513, + "language_loss": 0.62990272, + "learning_rate": 2.3625432915011443e-06, + "loss": 0.65827477, + "num_input_tokens_seen": 82102310, + "router_z_loss_clip": 3.6640625, + "router_z_loss_mlp": 2.63867188, + "step": 3815, + "time_per_iteration": 3.1701982021331787 + }, + { + "auxiliary_loss_clip": 0.01528293, + "auxiliary_loss_mlp": 0.01294084, + "balance_loss_clip": 1.16329706, + "balance_loss_mlp": 1.03106022, + "epoch": 0.4588468706787711, + "flos": 24100248453120.0, + "grad_norm": 2.3228037062956512, + "language_loss": 0.6555934, + "learning_rate": 2.3617771975482334e-06, + "loss": 0.68381715, + "num_input_tokens_seen": 82121140, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 2.63085938, + "step": 3816, + "time_per_iteration": 3.1537418365478516 + }, + { + "auxiliary_loss_clip": 0.01534381, + "auxiliary_loss_mlp": 0.01288191, + "balance_loss_clip": 1.16999054, + "balance_loss_mlp": 1.02821982, + "epoch": 0.4589671135694102, + "flos": 17890967445120.0, + "grad_norm": 1.6200866856929454, + "language_loss": 0.74763787, + "learning_rate": 2.3610110487140083e-06, + "loss": 0.77586353, + "num_input_tokens_seen": 82139575, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 2.59960938, + "step": 3817, + "time_per_iteration": 3.1590776443481445 + }, + { + "auxiliary_loss_clip": 0.01523586, + "auxiliary_loss_mlp": 0.0128757, + "balance_loss_clip": 1.15625906, + "balance_loss_mlp": 1.02740788, + "epoch": 0.4590873564600493, + "flos": 25629185812320.0, + "grad_norm": 2.07601548466923, + "language_loss": 0.81213212, + "learning_rate": 2.360244845114695e-06, + "loss": 0.8402437, + "num_input_tokens_seen": 82159195, + "router_z_loss_clip": 3.67382812, + "router_z_loss_mlp": 2.6015625, + "step": 3818, + "time_per_iteration": 3.0495264530181885 + }, + { + "auxiliary_loss_clip": 0.01528093, + "auxiliary_loss_mlp": 0.01293534, + "balance_loss_clip": 1.16129971, + "balance_loss_mlp": 1.03108335, + "epoch": 0.4592075993506884, + "flos": 18516750350400.0, + "grad_norm": 2.247666559488805, + "language_loss": 0.68479824, + "learning_rate": 2.3594785868665245e-06, + "loss": 0.7130146, + "num_input_tokens_seen": 82175500, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 2.625, + "step": 3819, + "time_per_iteration": 3.013756275177002 + }, + { + "auxiliary_loss_clip": 0.01530553, + "auxiliary_loss_mlp": 0.01313765, + "balance_loss_clip": 1.16542816, + "balance_loss_mlp": 1.04711795, + "epoch": 0.4593278422413275, + "flos": 20633200737120.0, + "grad_norm": 2.3548065972079804, + "language_loss": 0.8096168, + "learning_rate": 2.3587122740857386e-06, + "loss": 0.83806002, + "num_input_tokens_seen": 82192600, + "router_z_loss_clip": 3.65039062, + "router_z_loss_mlp": 2.66796875, + "step": 3820, + "time_per_iteration": 3.1317286491394043 + }, + { + "auxiliary_loss_clip": 0.01526082, + "auxiliary_loss_mlp": 0.01280048, + "balance_loss_clip": 1.159832, + "balance_loss_mlp": 1.02045751, + "epoch": 0.45944808513196655, + "flos": 21360518419680.0, + "grad_norm": 1.7810915708212487, + "language_loss": 0.78021491, + "learning_rate": 2.357945906888586e-06, + "loss": 0.80827624, + "num_input_tokens_seen": 82212040, + "router_z_loss_clip": 3.66015625, + "router_z_loss_mlp": 2.59570312, + "step": 3821, + "time_per_iteration": 3.087270498275757 + }, + { + "auxiliary_loss_clip": 0.01532405, + "auxiliary_loss_mlp": 0.01290565, + "balance_loss_clip": 1.16644979, + "balance_loss_mlp": 1.02735066, + "epoch": 0.45956832802260567, + "flos": 21429586327680.0, + "grad_norm": 2.3651233764502324, + "language_loss": 0.7977953, + "learning_rate": 2.357179485391324e-06, + "loss": 0.82602507, + "num_input_tokens_seen": 82229895, + "router_z_loss_clip": 3.66015625, + "router_z_loss_mlp": 2.6328125, + "step": 3822, + "time_per_iteration": 3.144059896469116 + }, + { + "auxiliary_loss_clip": 0.01533936, + "auxiliary_loss_mlp": 0.01289186, + "balance_loss_clip": 1.1679213, + "balance_loss_mlp": 1.02692568, + "epoch": 0.4596885709132448, + "flos": 22384854437760.0, + "grad_norm": 1.970729666839233, + "language_loss": 0.8642869, + "learning_rate": 2.3564130097102173e-06, + "loss": 0.89251804, + "num_input_tokens_seen": 82249550, + "router_z_loss_clip": 3.6640625, + "router_z_loss_mlp": 2.62304688, + "step": 3823, + "time_per_iteration": 3.023632049560547 + }, + { + "auxiliary_loss_clip": 0.0153527, + "auxiliary_loss_mlp": 0.01293956, + "balance_loss_clip": 1.16903067, + "balance_loss_mlp": 1.03264928, + "epoch": 0.45980881380388383, + "flos": 28984268507040.0, + "grad_norm": 1.8477520403001748, + "language_loss": 0.75193733, + "learning_rate": 2.355646479961541e-06, + "loss": 0.78022951, + "num_input_tokens_seen": 82268860, + "router_z_loss_clip": 3.66015625, + "router_z_loss_mlp": 2.61328125, + "step": 3824, + "time_per_iteration": 3.084108352661133 + }, + { + "auxiliary_loss_clip": 0.01524614, + "auxiliary_loss_mlp": 0.0129175, + "balance_loss_clip": 1.15886629, + "balance_loss_mlp": 1.02910805, + "epoch": 0.45992905669452294, + "flos": 33399150413760.0, + "grad_norm": 2.218671504802974, + "language_loss": 0.71769392, + "learning_rate": 2.354879896261576e-06, + "loss": 0.7458576, + "num_input_tokens_seen": 82289070, + "router_z_loss_clip": 3.66015625, + "router_z_loss_mlp": 2.62695312, + "step": 3825, + "time_per_iteration": 3.077056407928467 + }, + { + "auxiliary_loss_clip": 0.01527393, + "auxiliary_loss_mlp": 0.01288737, + "balance_loss_clip": 1.16132641, + "balance_loss_mlp": 1.02914667, + "epoch": 0.46004929958516205, + "flos": 36321051221280.0, + "grad_norm": 2.2758896592438753, + "language_loss": 0.57037878, + "learning_rate": 2.3541132587266133e-06, + "loss": 0.59854007, + "num_input_tokens_seen": 82311790, + "router_z_loss_clip": 3.66015625, + "router_z_loss_mlp": 2.59570312, + "step": 3826, + "time_per_iteration": 3.0964348316192627 + }, + { + "auxiliary_loss_clip": 0.01532218, + "auxiliary_loss_mlp": 0.01292789, + "balance_loss_clip": 1.16657603, + "balance_loss_mlp": 1.03186393, + "epoch": 0.4601695424758011, + "flos": 17240379158880.0, + "grad_norm": 2.0305139216685153, + "language_loss": 0.69684094, + "learning_rate": 2.3533465674729515e-06, + "loss": 0.72509098, + "num_input_tokens_seen": 82329020, + "router_z_loss_clip": 3.65820312, + "router_z_loss_mlp": 2.609375, + "step": 3827, + "time_per_iteration": 3.0143392086029053 + }, + { + "auxiliary_loss_clip": 0.01532333, + "auxiliary_loss_mlp": 0.01290828, + "balance_loss_clip": 1.16530776, + "balance_loss_mlp": 1.02818608, + "epoch": 0.4602897853664402, + "flos": 15890274895680.0, + "grad_norm": 2.0776482423267972, + "language_loss": 0.7302891, + "learning_rate": 2.352579822616895e-06, + "loss": 0.75852072, + "num_input_tokens_seen": 82346455, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 2.62695312, + "step": 3828, + "time_per_iteration": 3.0334396362304688 + }, + { + "auxiliary_loss_clip": 0.01526953, + "auxiliary_loss_mlp": 0.01284905, + "balance_loss_clip": 1.16151702, + "balance_loss_mlp": 1.02245414, + "epoch": 0.4604100282570793, + "flos": 25415079163200.0, + "grad_norm": 1.8862188174899612, + "language_loss": 0.78123766, + "learning_rate": 2.351813024274761e-06, + "loss": 0.80935621, + "num_input_tokens_seen": 82367810, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 2.625, + "step": 3829, + "time_per_iteration": 3.0336315631866455 + }, + { + "auxiliary_loss_clip": 0.01534082, + "auxiliary_loss_mlp": 0.01298469, + "balance_loss_clip": 1.16684186, + "balance_loss_mlp": 1.03697193, + "epoch": 0.4605302711477184, + "flos": 27632874686400.0, + "grad_norm": 1.772854316444879, + "language_loss": 0.73360437, + "learning_rate": 2.3510461725628693e-06, + "loss": 0.76192987, + "num_input_tokens_seen": 82388275, + "router_z_loss_clip": 3.67773438, + "router_z_loss_mlp": 2.61523438, + "step": 3830, + "time_per_iteration": 3.9071643352508545 + }, + { + "auxiliary_loss_clip": 0.01527234, + "auxiliary_loss_mlp": 0.0128701, + "balance_loss_clip": 1.16126108, + "balance_loss_mlp": 1.02474976, + "epoch": 0.4606505140383575, + "flos": 23841803420640.0, + "grad_norm": 2.059169580727772, + "language_loss": 0.71138072, + "learning_rate": 2.350279267597554e-06, + "loss": 0.73952317, + "num_input_tokens_seen": 82408915, + "router_z_loss_clip": 3.66210938, + "router_z_loss_mlp": 2.62304688, + "step": 3831, + "time_per_iteration": 3.8834028244018555 + }, + { + "auxiliary_loss_clip": 0.01527429, + "auxiliary_loss_mlp": 0.01296215, + "balance_loss_clip": 1.16182923, + "balance_loss_mlp": 1.03414536, + "epoch": 0.46077075692899655, + "flos": 16108857067680.0, + "grad_norm": 3.492109887939736, + "language_loss": 0.82705444, + "learning_rate": 2.3495123094951515e-06, + "loss": 0.85529089, + "num_input_tokens_seen": 82427260, + "router_z_loss_clip": 3.65820312, + "router_z_loss_mlp": 2.62109375, + "step": 3832, + "time_per_iteration": 2.9774904251098633 + }, + { + "auxiliary_loss_clip": 0.01525429, + "auxiliary_loss_mlp": 0.01300978, + "balance_loss_clip": 1.15777707, + "balance_loss_mlp": 1.03967166, + "epoch": 0.46089099981963566, + "flos": 48801816148320.0, + "grad_norm": 2.176439742347192, + "language_loss": 0.76297379, + "learning_rate": 2.34874529837201e-06, + "loss": 0.79123783, + "num_input_tokens_seen": 82450805, + "router_z_loss_clip": 3.6796875, + "router_z_loss_mlp": 2.61328125, + "step": 3833, + "time_per_iteration": 3.274388313293457 + }, + { + "auxiliary_loss_clip": 0.01526007, + "auxiliary_loss_mlp": 0.01287168, + "balance_loss_clip": 1.15798628, + "balance_loss_mlp": 1.02795899, + "epoch": 0.46101124271027477, + "flos": 19101267053280.0, + "grad_norm": 1.9491325077241624, + "language_loss": 0.78779542, + "learning_rate": 2.347978234344483e-06, + "loss": 0.81592715, + "num_input_tokens_seen": 82467010, + "router_z_loss_clip": 3.68164062, + "router_z_loss_mlp": 2.59179688, + "step": 3834, + "time_per_iteration": 3.911409616470337 + }, + { + "auxiliary_loss_clip": 0.0153057, + "auxiliary_loss_mlp": 0.0130945, + "balance_loss_clip": 1.16303408, + "balance_loss_mlp": 1.04509127, + "epoch": 0.4611314856009138, + "flos": 39351048377760.0, + "grad_norm": 1.6746229462832851, + "language_loss": 0.69106054, + "learning_rate": 2.347211117528935e-06, + "loss": 0.71946079, + "num_input_tokens_seen": 82489310, + "router_z_loss_clip": 3.67773438, + "router_z_loss_mlp": 2.64453125, + "step": 3835, + "time_per_iteration": 3.2016842365264893 + }, + { + "auxiliary_loss_clip": 0.01537666, + "auxiliary_loss_mlp": 0.01298857, + "balance_loss_clip": 1.17209435, + "balance_loss_mlp": 1.03564322, + "epoch": 0.46125172849155294, + "flos": 20812716540000.0, + "grad_norm": 1.7401973712622996, + "language_loss": 0.71730191, + "learning_rate": 2.3464439480417374e-06, + "loss": 0.7456671, + "num_input_tokens_seen": 82508830, + "router_z_loss_clip": 3.65429688, + "router_z_loss_mlp": 2.6328125, + "step": 3836, + "time_per_iteration": 3.0713694095611572 + }, + { + "auxiliary_loss_clip": 0.0152999, + "auxiliary_loss_mlp": 0.01296345, + "balance_loss_clip": 1.16166008, + "balance_loss_mlp": 1.03389359, + "epoch": 0.46137197138219205, + "flos": 17932726713600.0, + "grad_norm": 2.7664378983791016, + "language_loss": 0.7748301, + "learning_rate": 2.3456767259992676e-06, + "loss": 0.80309349, + "num_input_tokens_seen": 82526475, + "router_z_loss_clip": 3.68359375, + "router_z_loss_mlp": 2.625, + "step": 3837, + "time_per_iteration": 3.8090083599090576 + }, + { + "auxiliary_loss_clip": 0.01521453, + "auxiliary_loss_mlp": 0.01289449, + "balance_loss_clip": 1.1536932, + "balance_loss_mlp": 1.02814221, + "epoch": 0.4614922142728311, + "flos": 16838488368000.0, + "grad_norm": 3.310906684847211, + "language_loss": 0.88889498, + "learning_rate": 2.3449094515179135e-06, + "loss": 0.91700399, + "num_input_tokens_seen": 82543935, + "router_z_loss_clip": 3.67773438, + "router_z_loss_mlp": 2.61328125, + "step": 3838, + "time_per_iteration": 3.1638853549957275 + }, + { + "auxiliary_loss_clip": 0.01523512, + "auxiliary_loss_mlp": 0.01283207, + "balance_loss_clip": 1.15716124, + "balance_loss_mlp": 1.02285349, + "epoch": 0.4616124571634702, + "flos": 26617338001440.0, + "grad_norm": 2.073814036620247, + "language_loss": 0.81677949, + "learning_rate": 2.34414212471407e-06, + "loss": 0.84484673, + "num_input_tokens_seen": 82563730, + "router_z_loss_clip": 3.6640625, + "router_z_loss_mlp": 2.60351562, + "step": 3839, + "time_per_iteration": 3.381136178970337 + }, + { + "auxiliary_loss_clip": 0.01520179, + "auxiliary_loss_mlp": 0.01289978, + "balance_loss_clip": 1.15262043, + "balance_loss_mlp": 1.02714539, + "epoch": 0.4617327000541093, + "flos": 20342250907200.0, + "grad_norm": 5.0595609773830805, + "language_loss": 0.73034263, + "learning_rate": 2.3433747457041394e-06, + "loss": 0.75844425, + "num_input_tokens_seen": 82582435, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 2.62890625, + "step": 3840, + "time_per_iteration": 3.122537851333618 + }, + { + "auxiliary_loss_clip": 0.01523173, + "auxiliary_loss_mlp": 0.01293521, + "balance_loss_clip": 1.15615654, + "balance_loss_mlp": 1.03354955, + "epoch": 0.4618529429447484, + "flos": 29573374517280.0, + "grad_norm": 1.9062004961173984, + "language_loss": 0.84978294, + "learning_rate": 2.342607314604533e-06, + "loss": 0.87794989, + "num_input_tokens_seen": 82602185, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 2.59960938, + "step": 3841, + "time_per_iteration": 3.1295042037963867 + }, + { + "auxiliary_loss_clip": 0.01517077, + "auxiliary_loss_mlp": 0.01297863, + "balance_loss_clip": 1.14938438, + "balance_loss_mlp": 1.03560305, + "epoch": 0.4619731858353875, + "flos": 19788797731680.0, + "grad_norm": 2.353266940789008, + "language_loss": 0.83907735, + "learning_rate": 2.3418398315316694e-06, + "loss": 0.86722672, + "num_input_tokens_seen": 82620005, + "router_z_loss_clip": 3.67773438, + "router_z_loss_mlp": 2.62304688, + "step": 3842, + "time_per_iteration": 3.0841407775878906 + }, + { + "auxiliary_loss_clip": 0.01521903, + "auxiliary_loss_mlp": 0.01291811, + "balance_loss_clip": 1.15403426, + "balance_loss_mlp": 1.03183973, + "epoch": 0.4620934287260266, + "flos": 18953042346720.0, + "grad_norm": 2.8486262577174433, + "language_loss": 0.78560865, + "learning_rate": 2.3410722966019755e-06, + "loss": 0.81374586, + "num_input_tokens_seen": 82635120, + "router_z_loss_clip": 3.68359375, + "router_z_loss_mlp": 2.59960938, + "step": 3843, + "time_per_iteration": 3.115201950073242 + }, + { + "auxiliary_loss_clip": 0.01516692, + "auxiliary_loss_mlp": 0.01289551, + "balance_loss_clip": 1.14935398, + "balance_loss_mlp": 1.0259552, + "epoch": 0.46221367161666566, + "flos": 37344704532480.0, + "grad_norm": 14.191934676608277, + "language_loss": 0.66075683, + "learning_rate": 2.3403047099318848e-06, + "loss": 0.68881929, + "num_input_tokens_seen": 82659190, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 2.63671875, + "step": 3844, + "time_per_iteration": 3.248389482498169 + }, + { + "auxiliary_loss_clip": 0.01517144, + "auxiliary_loss_mlp": 0.01279806, + "balance_loss_clip": 1.15091205, + "balance_loss_mlp": 1.01849902, + "epoch": 0.46233391450730477, + "flos": 14430291660000.0, + "grad_norm": 2.226345896280632, + "language_loss": 0.75452757, + "learning_rate": 2.3395370716378405e-06, + "loss": 0.78249705, + "num_input_tokens_seen": 82676635, + "router_z_loss_clip": 3.6640625, + "router_z_loss_mlp": 2.61328125, + "step": 3845, + "time_per_iteration": 3.0428080558776855 + }, + { + "auxiliary_loss_clip": 0.01515025, + "auxiliary_loss_mlp": 0.01314622, + "balance_loss_clip": 1.14654326, + "balance_loss_mlp": 1.05083537, + "epoch": 0.4624541573979438, + "flos": 22495302332640.0, + "grad_norm": 2.430878377752031, + "language_loss": 0.72721565, + "learning_rate": 2.338769381836292e-06, + "loss": 0.75551212, + "num_input_tokens_seen": 82696245, + "router_z_loss_clip": 3.68554688, + "router_z_loss_mlp": 2.63867188, + "step": 3846, + "time_per_iteration": 3.0506861209869385 + }, + { + "auxiliary_loss_clip": 0.0151731, + "auxiliary_loss_mlp": 0.01297546, + "balance_loss_clip": 1.15016317, + "balance_loss_mlp": 1.03547597, + "epoch": 0.46257440028858293, + "flos": 14466816842400.0, + "grad_norm": 2.134187184986029, + "language_loss": 0.73475939, + "learning_rate": 2.3380016406436984e-06, + "loss": 0.76290792, + "num_input_tokens_seen": 82713725, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 2.62109375, + "step": 3847, + "time_per_iteration": 3.000605821609497 + }, + { + "auxiliary_loss_clip": 0.01519035, + "auxiliary_loss_mlp": 0.01278718, + "balance_loss_clip": 1.15304303, + "balance_loss_mlp": 1.01702952, + "epoch": 0.46269464317922204, + "flos": 23334509180160.0, + "grad_norm": 2.1384299998576854, + "language_loss": 0.81543189, + "learning_rate": 2.337233848176524e-06, + "loss": 0.84340942, + "num_input_tokens_seen": 82731495, + "router_z_loss_clip": 3.65820312, + "router_z_loss_mlp": 2.6171875, + "step": 3848, + "time_per_iteration": 3.066821575164795 + }, + { + "auxiliary_loss_clip": 0.01526735, + "auxiliary_loss_mlp": 0.01308902, + "balance_loss_clip": 1.15942717, + "balance_loss_mlp": 1.04587901, + "epoch": 0.4628148860698611, + "flos": 18554034096000.0, + "grad_norm": 2.6376109710946487, + "language_loss": 0.83490574, + "learning_rate": 2.3364660045512435e-06, + "loss": 0.86326206, + "num_input_tokens_seen": 82750255, + "router_z_loss_clip": 3.67773438, + "router_z_loss_mlp": 2.63085938, + "step": 3849, + "time_per_iteration": 3.1151845455169678 + }, + { + "auxiliary_loss_clip": 0.01543177, + "auxiliary_loss_mlp": 0.0120356, + "balance_loss_clip": 1.18241262, + "balance_loss_mlp": 0.98993683, + "epoch": 0.4629351289605002, + "flos": 70675317668160.0, + "grad_norm": 0.7454651314929001, + "language_loss": 0.58134162, + "learning_rate": 2.335698109884337e-06, + "loss": 0.60880893, + "num_input_tokens_seen": 82815460, + "router_z_loss_clip": 3.609375, + "router_z_loss_mlp": 2.140625, + "step": 3850, + "time_per_iteration": 3.639984607696533 + }, + { + "auxiliary_loss_clip": 0.01541083, + "auxiliary_loss_mlp": 0.01201073, + "balance_loss_clip": 1.17988086, + "balance_loss_mlp": 0.98744965, + "epoch": 0.4630553718511393, + "flos": 59694436765440.0, + "grad_norm": 0.789532604115812, + "language_loss": 0.59859955, + "learning_rate": 2.334930164292294e-06, + "loss": 0.62602115, + "num_input_tokens_seen": 82878010, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 2.140625, + "step": 3851, + "time_per_iteration": 3.5598323345184326 + }, + { + "auxiliary_loss_clip": 0.01519763, + "auxiliary_loss_mlp": 0.01285332, + "balance_loss_clip": 1.15381193, + "balance_loss_mlp": 1.02345347, + "epoch": 0.4631756147417784, + "flos": 15962414984640.0, + "grad_norm": 2.733609162300005, + "language_loss": 0.80197096, + "learning_rate": 2.334162167891612e-06, + "loss": 0.83002198, + "num_input_tokens_seen": 82895275, + "router_z_loss_clip": 3.66210938, + "router_z_loss_mlp": 2.61914062, + "step": 3852, + "time_per_iteration": 3.0748679637908936 + }, + { + "auxiliary_loss_clip": 0.01521844, + "auxiliary_loss_mlp": 0.01288542, + "balance_loss_clip": 1.15469897, + "balance_loss_mlp": 1.02609062, + "epoch": 0.4632958576324175, + "flos": 16474829526720.0, + "grad_norm": 2.2782966900768393, + "language_loss": 0.75601685, + "learning_rate": 2.333394120798795e-06, + "loss": 0.78412068, + "num_input_tokens_seen": 82914010, + "router_z_loss_clip": 3.67382812, + "router_z_loss_mlp": 2.625, + "step": 3853, + "time_per_iteration": 3.1164355278015137 + }, + { + "auxiliary_loss_clip": 0.01514524, + "auxiliary_loss_mlp": 0.01294252, + "balance_loss_clip": 1.14813793, + "balance_loss_mlp": 1.03161013, + "epoch": 0.4634161005230566, + "flos": 22348708536960.0, + "grad_norm": 3.452708357776594, + "language_loss": 0.72658145, + "learning_rate": 2.3326260231303545e-06, + "loss": 0.75466913, + "num_input_tokens_seen": 82932610, + "router_z_loss_clip": 3.6640625, + "router_z_loss_mlp": 2.62695312, + "step": 3854, + "time_per_iteration": 3.0673861503601074 + }, + { + "auxiliary_loss_clip": 0.01519865, + "auxiliary_loss_mlp": 0.01286703, + "balance_loss_clip": 1.15404773, + "balance_loss_mlp": 1.02806664, + "epoch": 0.46353634341369565, + "flos": 15744060381600.0, + "grad_norm": 1.7688825397876797, + "language_loss": 0.86733401, + "learning_rate": 2.331857875002811e-06, + "loss": 0.89539969, + "num_input_tokens_seen": 82951210, + "router_z_loss_clip": 3.66210938, + "router_z_loss_mlp": 2.5859375, + "step": 3855, + "time_per_iteration": 3.037034511566162 + }, + { + "auxiliary_loss_clip": 0.01527413, + "auxiliary_loss_mlp": 0.01307477, + "balance_loss_clip": 1.16064, + "balance_loss_mlp": 1.04731441, + "epoch": 0.46365658630433476, + "flos": 28332352735200.0, + "grad_norm": 1.8406780227811546, + "language_loss": 0.76245755, + "learning_rate": 2.3310896765326916e-06, + "loss": 0.79080641, + "num_input_tokens_seen": 82972210, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 2.6015625, + "step": 3856, + "time_per_iteration": 3.101064443588257 + }, + { + "auxiliary_loss_clip": 0.01521621, + "auxiliary_loss_mlp": 0.01304104, + "balance_loss_clip": 1.15313148, + "balance_loss_mlp": 1.04108047, + "epoch": 0.46377682919497387, + "flos": 24610387305600.0, + "grad_norm": 1.6854076574745558, + "language_loss": 0.84181893, + "learning_rate": 2.330321427836531e-06, + "loss": 0.87007618, + "num_input_tokens_seen": 82994080, + "router_z_loss_clip": 3.6875, + "router_z_loss_mlp": 2.63085938, + "step": 3857, + "time_per_iteration": 3.9046623706817627 + }, + { + "auxiliary_loss_clip": 0.01517919, + "auxiliary_loss_mlp": 0.01293261, + "balance_loss_clip": 1.15078282, + "balance_loss_mlp": 1.0321449, + "epoch": 0.4638970720856129, + "flos": 19062883391040.0, + "grad_norm": 4.447976088724934, + "language_loss": 0.82559562, + "learning_rate": 2.3295531290308733e-06, + "loss": 0.85370743, + "num_input_tokens_seen": 83012230, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 2.61132812, + "step": 3858, + "time_per_iteration": 4.028714418411255 + }, + { + "auxiliary_loss_clip": 0.01526517, + "auxiliary_loss_mlp": 0.01305224, + "balance_loss_clip": 1.16124558, + "balance_loss_mlp": 1.0387671, + "epoch": 0.46401731497625204, + "flos": 18472563679680.0, + "grad_norm": 5.24469699870044, + "language_loss": 0.75716227, + "learning_rate": 2.3287847802322678e-06, + "loss": 0.78547966, + "num_input_tokens_seen": 83027800, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 2.66601562, + "step": 3859, + "time_per_iteration": 3.0351107120513916 + }, + { + "auxiliary_loss_clip": 0.01526309, + "auxiliary_loss_mlp": 0.01304613, + "balance_loss_clip": 1.16044068, + "balance_loss_mlp": 1.0370121, + "epoch": 0.4641375578668911, + "flos": 26069422337280.0, + "grad_norm": 1.7581132063644813, + "language_loss": 0.8420018, + "learning_rate": 2.3280163815572723e-06, + "loss": 0.87031102, + "num_input_tokens_seen": 83048395, + "router_z_loss_clip": 3.6640625, + "router_z_loss_mlp": 2.67773438, + "step": 3860, + "time_per_iteration": 3.1413683891296387 + }, + { + "auxiliary_loss_clip": 0.01519273, + "auxiliary_loss_mlp": 0.01298382, + "balance_loss_clip": 1.15243673, + "balance_loss_mlp": 1.03478622, + "epoch": 0.4642578007575302, + "flos": 19572263680320.0, + "grad_norm": 1.9485170375048309, + "language_loss": 0.77354002, + "learning_rate": 2.3272479331224522e-06, + "loss": 0.80171657, + "num_input_tokens_seen": 83065825, + "router_z_loss_clip": 3.66796875, + "router_z_loss_mlp": 2.63671875, + "step": 3861, + "time_per_iteration": 4.034740209579468 + }, + { + "auxiliary_loss_clip": 0.01517732, + "auxiliary_loss_mlp": 0.01299943, + "balance_loss_clip": 1.14960527, + "balance_loss_mlp": 1.03711057, + "epoch": 0.4643780436481693, + "flos": 28188906976800.0, + "grad_norm": 1.6345318822640602, + "language_loss": 0.78315413, + "learning_rate": 2.3264794350443817e-06, + "loss": 0.81133091, + "num_input_tokens_seen": 83087920, + "router_z_loss_clip": 3.68164062, + "router_z_loss_mlp": 2.62890625, + "step": 3862, + "time_per_iteration": 3.122213363647461 + }, + { + "auxiliary_loss_clip": 0.01524886, + "auxiliary_loss_mlp": 0.0129687, + "balance_loss_clip": 1.15876341, + "balance_loss_mlp": 1.03556371, + "epoch": 0.46449828653880837, + "flos": 25377681633120.0, + "grad_norm": 1.9759884854297183, + "language_loss": 0.78621453, + "learning_rate": 2.3257108874396396e-06, + "loss": 0.81443208, + "num_input_tokens_seen": 83109015, + "router_z_loss_clip": 3.6640625, + "router_z_loss_mlp": 2.61328125, + "step": 3863, + "time_per_iteration": 3.0951409339904785 + }, + { + "auxiliary_loss_clip": 0.01518235, + "auxiliary_loss_mlp": 0.01294101, + "balance_loss_clip": 1.15114975, + "balance_loss_mlp": 1.03393817, + "epoch": 0.4646185294294475, + "flos": 16036185984480.0, + "grad_norm": 2.778275585164774, + "language_loss": 0.73268944, + "learning_rate": 2.3249422904248152e-06, + "loss": 0.76081276, + "num_input_tokens_seen": 83127450, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 2.6015625, + "step": 3864, + "time_per_iteration": 3.0871410369873047 + }, + { + "auxiliary_loss_clip": 0.01516923, + "auxiliary_loss_mlp": 0.01289816, + "balance_loss_clip": 1.15016448, + "balance_loss_mlp": 1.0275557, + "epoch": 0.4647387723200866, + "flos": 26365720037760.0, + "grad_norm": 1.4706953599536798, + "language_loss": 0.87066925, + "learning_rate": 2.324173644116504e-06, + "loss": 0.89873666, + "num_input_tokens_seen": 83150300, + "router_z_loss_clip": 3.66992188, + "router_z_loss_mlp": 2.62304688, + "step": 3865, + "time_per_iteration": 3.870681047439575 + }, + { + "auxiliary_loss_clip": 0.01526433, + "auxiliary_loss_mlp": 0.01305735, + "balance_loss_clip": 1.15820193, + "balance_loss_mlp": 1.04271162, + "epoch": 0.46485901521072565, + "flos": 27162560766240.0, + "grad_norm": 1.9298719089980978, + "language_loss": 0.81414175, + "learning_rate": 2.3234049486313087e-06, + "loss": 0.84246337, + "num_input_tokens_seen": 83171750, + "router_z_loss_clip": 3.68945312, + "router_z_loss_mlp": 2.63085938, + "step": 3866, + "time_per_iteration": 3.1035473346710205 + }, + { + "auxiliary_loss_clip": 0.01515651, + "auxiliary_loss_mlp": 0.01296295, + "balance_loss_clip": 1.14775491, + "balance_loss_mlp": 1.03670478, + "epoch": 0.46497925810136476, + "flos": 24282153730080.0, + "grad_norm": 2.7642670978090327, + "language_loss": 0.76336223, + "learning_rate": 2.322636204085839e-06, + "loss": 0.79148167, + "num_input_tokens_seen": 83191820, + "router_z_loss_clip": 3.6796875, + "router_z_loss_mlp": 2.59570312, + "step": 3867, + "time_per_iteration": 3.0979223251342773 + }, + { + "auxiliary_loss_clip": 0.01520613, + "auxiliary_loss_mlp": 0.01283521, + "balance_loss_clip": 1.15316415, + "balance_loss_mlp": 1.02354932, + "epoch": 0.46509950099200387, + "flos": 16254995725440.0, + "grad_norm": 2.1899098788236713, + "language_loss": 0.79169834, + "learning_rate": 2.3218674105967143e-06, + "loss": 0.8197397, + "num_input_tokens_seen": 83210085, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 2.59960938, + "step": 3868, + "time_per_iteration": 3.053471088409424 + }, + { + "auxiliary_loss_clip": 0.01519212, + "auxiliary_loss_mlp": 0.01286662, + "balance_loss_clip": 1.152637, + "balance_loss_mlp": 1.02802515, + "epoch": 0.4652197438826429, + "flos": 23444615721600.0, + "grad_norm": 1.5101782965879438, + "language_loss": 0.83699894, + "learning_rate": 2.3210985682805593e-06, + "loss": 0.86505771, + "num_input_tokens_seen": 83231865, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 2.5859375, + "step": 3869, + "time_per_iteration": 3.096295118331909 + }, + { + "auxiliary_loss_clip": 0.01521891, + "auxiliary_loss_mlp": 0.01290982, + "balance_loss_clip": 1.15332246, + "balance_loss_mlp": 1.02795887, + "epoch": 0.46533998677328203, + "flos": 16218242974080.0, + "grad_norm": 2.3718410714484333, + "language_loss": 0.6782425, + "learning_rate": 2.320329677254007e-06, + "loss": 0.70637119, + "num_input_tokens_seen": 83249195, + "router_z_loss_clip": 3.6875, + "router_z_loss_mlp": 2.63085938, + "step": 3870, + "time_per_iteration": 3.049222230911255 + }, + { + "auxiliary_loss_clip": 0.01515551, + "auxiliary_loss_mlp": 0.0128441, + "balance_loss_clip": 1.14873981, + "balance_loss_mlp": 1.02481961, + "epoch": 0.46546022966392114, + "flos": 21143984368320.0, + "grad_norm": 2.3911677747583546, + "language_loss": 0.72484398, + "learning_rate": 2.319560737633697e-06, + "loss": 0.7528435, + "num_input_tokens_seen": 83267915, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 2.59570312, + "step": 3871, + "time_per_iteration": 3.0984835624694824 + }, + { + "auxiliary_loss_clip": 0.01512487, + "auxiliary_loss_mlp": 0.01289867, + "balance_loss_clip": 1.1462034, + "balance_loss_mlp": 1.02455485, + "epoch": 0.4655804725545602, + "flos": 41175145592640.0, + "grad_norm": 1.6141163115497932, + "language_loss": 0.68360192, + "learning_rate": 2.3187917495362775e-06, + "loss": 0.71162546, + "num_input_tokens_seen": 83292325, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 2.65429688, + "step": 3872, + "time_per_iteration": 3.2283449172973633 + }, + { + "auxiliary_loss_clip": 0.01515532, + "auxiliary_loss_mlp": 0.01285589, + "balance_loss_clip": 1.14853573, + "balance_loss_mlp": 1.02504539, + "epoch": 0.4657007154451993, + "flos": 19572870530880.0, + "grad_norm": 2.5486887058363266, + "language_loss": 0.7690388, + "learning_rate": 2.318022713078403e-06, + "loss": 0.79705, + "num_input_tokens_seen": 83306905, + "router_z_loss_clip": 3.67382812, + "router_z_loss_mlp": 2.60546875, + "step": 3873, + "time_per_iteration": 3.1775565147399902 + }, + { + "auxiliary_loss_clip": 0.01510012, + "auxiliary_loss_mlp": 0.01282006, + "balance_loss_clip": 1.14204407, + "balance_loss_mlp": 1.02260613, + "epoch": 0.4658209583358384, + "flos": 15519333847680.0, + "grad_norm": 2.4218872376977316, + "language_loss": 0.8536799, + "learning_rate": 2.3172536283767354e-06, + "loss": 0.88160002, + "num_input_tokens_seen": 83320665, + "router_z_loss_clip": 3.6796875, + "router_z_loss_mlp": 2.59375, + "step": 3874, + "time_per_iteration": 3.2075514793395996 + }, + { + "auxiliary_loss_clip": 0.01518841, + "auxiliary_loss_mlp": 0.01293176, + "balance_loss_clip": 1.15056705, + "balance_loss_mlp": 1.03072536, + "epoch": 0.4659412012264775, + "flos": 14904967318560.0, + "grad_norm": 2.0554582641644363, + "language_loss": 0.81114155, + "learning_rate": 2.3164844955479447e-06, + "loss": 0.83926177, + "num_input_tokens_seen": 83336475, + "router_z_loss_clip": 3.68359375, + "router_z_loss_mlp": 2.625, + "step": 3875, + "time_per_iteration": 3.1632280349731445 + }, + { + "auxiliary_loss_clip": 0.01518529, + "auxiliary_loss_mlp": 0.01304121, + "balance_loss_clip": 1.15150881, + "balance_loss_mlp": 1.03995323, + "epoch": 0.4660614441171166, + "flos": 24427913106240.0, + "grad_norm": 1.6256163860587, + "language_loss": 0.70642436, + "learning_rate": 2.3157153147087082e-06, + "loss": 0.73465085, + "num_input_tokens_seen": 83358365, + "router_z_loss_clip": 3.66992188, + "router_z_loss_mlp": 2.64257812, + "step": 3876, + "time_per_iteration": 3.078711986541748 + }, + { + "auxiliary_loss_clip": 0.01516641, + "auxiliary_loss_mlp": 0.01296206, + "balance_loss_clip": 1.1487149, + "balance_loss_mlp": 1.03470874, + "epoch": 0.46618168700775564, + "flos": 22093297757280.0, + "grad_norm": 1.9340482351675392, + "language_loss": 0.82771194, + "learning_rate": 2.314946085975709e-06, + "loss": 0.85584038, + "num_input_tokens_seen": 83377345, + "router_z_loss_clip": 3.68359375, + "router_z_loss_mlp": 2.61523438, + "step": 3877, + "time_per_iteration": 3.119760274887085 + }, + { + "auxiliary_loss_clip": 0.01524319, + "auxiliary_loss_mlp": 0.01287121, + "balance_loss_clip": 1.15668678, + "balance_loss_mlp": 1.02600479, + "epoch": 0.46630192989839475, + "flos": 26179377166080.0, + "grad_norm": 1.8220414627105088, + "language_loss": 0.82035989, + "learning_rate": 2.3141768094656393e-06, + "loss": 0.84847432, + "num_input_tokens_seen": 83395920, + "router_z_loss_clip": 3.68554688, + "router_z_loss_mlp": 2.61132812, + "step": 3878, + "time_per_iteration": 3.0192158222198486 + }, + { + "auxiliary_loss_clip": 0.01509605, + "auxiliary_loss_mlp": 0.01299706, + "balance_loss_clip": 1.14048922, + "balance_loss_mlp": 1.03858995, + "epoch": 0.46642217278903386, + "flos": 11511197536320.0, + "grad_norm": 2.6512204266144006, + "language_loss": 0.83227885, + "learning_rate": 2.3134074852951966e-06, + "loss": 0.86037201, + "num_input_tokens_seen": 83412510, + "router_z_loss_clip": 3.69921875, + "router_z_loss_mlp": 2.61132812, + "step": 3879, + "time_per_iteration": 3.048004388809204 + }, + { + "auxiliary_loss_clip": 0.01516227, + "auxiliary_loss_mlp": 0.01286057, + "balance_loss_clip": 1.1498363, + "balance_loss_mlp": 1.02475047, + "epoch": 0.4665424156796729, + "flos": 32309577231840.0, + "grad_norm": 1.768659338571897, + "language_loss": 0.77812928, + "learning_rate": 2.312638113581088e-06, + "loss": 0.80615211, + "num_input_tokens_seen": 83432995, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 2.61328125, + "step": 3880, + "time_per_iteration": 3.1333115100860596 + }, + { + "auxiliary_loss_clip": 0.01508694, + "auxiliary_loss_mlp": 0.01290425, + "balance_loss_clip": 1.13925683, + "balance_loss_mlp": 1.0296905, + "epoch": 0.46666265857031203, + "flos": 18437821120800.0, + "grad_norm": 2.9562658733588214, + "language_loss": 0.78562438, + "learning_rate": 2.311868694440027e-06, + "loss": 0.81361562, + "num_input_tokens_seen": 83447415, + "router_z_loss_clip": 3.69335938, + "router_z_loss_mlp": 2.60742188, + "step": 3881, + "time_per_iteration": 2.955287456512451 + }, + { + "auxiliary_loss_clip": 0.01529622, + "auxiliary_loss_mlp": 0.01295006, + "balance_loss_clip": 1.16666067, + "balance_loss_mlp": 1.07604218, + "epoch": 0.46678290146095114, + "flos": 68446447122240.0, + "grad_norm": 0.7646302454845726, + "language_loss": 0.62368828, + "learning_rate": 2.3110992279887323e-06, + "loss": 0.65193456, + "num_input_tokens_seen": 83519340, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.1953125, + "step": 3882, + "time_per_iteration": 3.6290435791015625 + }, + { + "auxiliary_loss_clip": 0.01512259, + "auxiliary_loss_mlp": 0.01285023, + "balance_loss_clip": 1.14353144, + "balance_loss_mlp": 1.02371597, + "epoch": 0.4669031443515902, + "flos": 17714447966880.0, + "grad_norm": 2.2983641916113235, + "language_loss": 0.84789217, + "learning_rate": 2.310329714343932e-06, + "loss": 0.87586492, + "num_input_tokens_seen": 83535490, + "router_z_loss_clip": 3.69140625, + "router_z_loss_mlp": 2.61328125, + "step": 3883, + "time_per_iteration": 3.0465497970581055 + }, + { + "auxiliary_loss_clip": 0.01517654, + "auxiliary_loss_mlp": 0.01282291, + "balance_loss_clip": 1.15098953, + "balance_loss_mlp": 1.01945877, + "epoch": 0.4670233872422293, + "flos": 23949823913280.0, + "grad_norm": 2.1428364574310663, + "language_loss": 0.81865478, + "learning_rate": 2.309560153622361e-06, + "loss": 0.8466543, + "num_input_tokens_seen": 83552400, + "router_z_loss_clip": 3.66796875, + "router_z_loss_mlp": 2.62890625, + "step": 3884, + "time_per_iteration": 3.0767126083374023 + }, + { + "auxiliary_loss_clip": 0.01515673, + "auxiliary_loss_mlp": 0.01290048, + "balance_loss_clip": 1.14814234, + "balance_loss_mlp": 1.02950406, + "epoch": 0.4671436301328684, + "flos": 28113808491360.0, + "grad_norm": 2.125907160313199, + "language_loss": 0.74951214, + "learning_rate": 2.3087905459407602e-06, + "loss": 0.77756935, + "num_input_tokens_seen": 83571340, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 2.60546875, + "step": 3885, + "time_per_iteration": 4.901641607284546 + }, + { + "auxiliary_loss_clip": 0.01522789, + "auxiliary_loss_mlp": 0.0121212, + "balance_loss_clip": 1.15970731, + "balance_loss_mlp": 0.99849701, + "epoch": 0.46726387302350747, + "flos": 69376189580640.0, + "grad_norm": 0.7936344963452818, + "language_loss": 0.62873483, + "learning_rate": 2.3080208914158795e-06, + "loss": 0.65608394, + "num_input_tokens_seen": 83634340, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.140625, + "step": 3886, + "time_per_iteration": 3.4945976734161377 + }, + { + "auxiliary_loss_clip": 0.01517577, + "auxiliary_loss_mlp": 0.01296497, + "balance_loss_clip": 1.15036631, + "balance_loss_mlp": 1.03423619, + "epoch": 0.4673841159141466, + "flos": 25521999739200.0, + "grad_norm": 2.354347463305994, + "language_loss": 0.72314018, + "learning_rate": 2.3072511901644753e-06, + "loss": 0.7512809, + "num_input_tokens_seen": 83653410, + "router_z_loss_clip": 3.67773438, + "router_z_loss_mlp": 2.62304688, + "step": 3887, + "time_per_iteration": 3.0761969089508057 + }, + { + "auxiliary_loss_clip": 0.01514732, + "auxiliary_loss_mlp": 0.01279215, + "balance_loss_clip": 1.14741397, + "balance_loss_mlp": 1.02305794, + "epoch": 0.4675043588047857, + "flos": 24501608249760.0, + "grad_norm": 1.979060275752073, + "language_loss": 0.80889177, + "learning_rate": 2.306481442303309e-06, + "loss": 0.83683121, + "num_input_tokens_seen": 83672985, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 2.56054688, + "step": 3888, + "time_per_iteration": 3.0350911617279053 + }, + { + "auxiliary_loss_clip": 0.01505297, + "auxiliary_loss_mlp": 0.01288292, + "balance_loss_clip": 1.13750196, + "balance_loss_mlp": 1.02660334, + "epoch": 0.46762460169542475, + "flos": 20962951439040.0, + "grad_norm": 3.1729974660401923, + "language_loss": 0.73329407, + "learning_rate": 2.3057116479491515e-06, + "loss": 0.76122993, + "num_input_tokens_seen": 83692395, + "router_z_loss_clip": 3.67773438, + "router_z_loss_mlp": 2.6171875, + "step": 3889, + "time_per_iteration": 3.8871750831604004 + }, + { + "auxiliary_loss_clip": 0.01510487, + "auxiliary_loss_mlp": 0.01281305, + "balance_loss_clip": 1.14250469, + "balance_loss_mlp": 1.02285957, + "epoch": 0.46774484458606386, + "flos": 19173672639360.0, + "grad_norm": 2.559343051964106, + "language_loss": 0.76273006, + "learning_rate": 2.30494180721878e-06, + "loss": 0.79064804, + "num_input_tokens_seen": 83709735, + "router_z_loss_clip": 3.68164062, + "router_z_loss_mlp": 2.58398438, + "step": 3890, + "time_per_iteration": 3.107913017272949 + }, + { + "auxiliary_loss_clip": 0.01507052, + "auxiliary_loss_mlp": 0.01284175, + "balance_loss_clip": 1.13788819, + "balance_loss_mlp": 1.02572942, + "epoch": 0.4678650874767029, + "flos": 17969631177600.0, + "grad_norm": 1.961941530684866, + "language_loss": 0.89904284, + "learning_rate": 2.3041719202289794e-06, + "loss": 0.92695504, + "num_input_tokens_seen": 83725910, + "router_z_loss_clip": 3.69335938, + "router_z_loss_mlp": 2.58398438, + "step": 3891, + "time_per_iteration": 3.0721490383148193 + }, + { + "auxiliary_loss_clip": 0.01515858, + "auxiliary_loss_mlp": 0.01284528, + "balance_loss_clip": 1.14894056, + "balance_loss_mlp": 1.02932477, + "epoch": 0.467985330367342, + "flos": 21362945821920.0, + "grad_norm": 1.7911217530887982, + "language_loss": 0.8087166, + "learning_rate": 2.30340198709654e-06, + "loss": 0.83672047, + "num_input_tokens_seen": 83745745, + "router_z_loss_clip": 3.66992188, + "router_z_loss_mlp": 2.55078125, + "step": 3892, + "time_per_iteration": 3.835632562637329 + }, + { + "auxiliary_loss_clip": 0.01509747, + "auxiliary_loss_mlp": 0.01283812, + "balance_loss_clip": 1.14240217, + "balance_loss_mlp": 1.02498484, + "epoch": 0.46810557325798113, + "flos": 20523928615200.0, + "grad_norm": 2.1319669908789307, + "language_loss": 0.74752629, + "learning_rate": 2.3026320079382605e-06, + "loss": 0.77546191, + "num_input_tokens_seen": 83762680, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 2.58789062, + "step": 3893, + "time_per_iteration": 3.0492730140686035 + }, + { + "auxiliary_loss_clip": 0.01507886, + "auxiliary_loss_mlp": 0.01274209, + "balance_loss_clip": 1.14096165, + "balance_loss_mlp": 1.01595378, + "epoch": 0.4682258161486202, + "flos": 30120493690080.0, + "grad_norm": 3.520691002206835, + "language_loss": 0.76772743, + "learning_rate": 2.3018619828709454e-06, + "loss": 0.79554838, + "num_input_tokens_seen": 83784220, + "router_z_loss_clip": 3.66992188, + "router_z_loss_mlp": 2.58203125, + "step": 3894, + "time_per_iteration": 3.092048168182373 + }, + { + "auxiliary_loss_clip": 0.01506682, + "auxiliary_loss_mlp": 0.01286673, + "balance_loss_clip": 1.1392051, + "balance_loss_mlp": 1.02822733, + "epoch": 0.4683460590392593, + "flos": 25295263012800.0, + "grad_norm": 1.9533880508935726, + "language_loss": 0.82094443, + "learning_rate": 2.3010919120114084e-06, + "loss": 0.84887803, + "num_input_tokens_seen": 83800750, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 2.58398438, + "step": 3895, + "time_per_iteration": 3.2888834476470947 + }, + { + "auxiliary_loss_clip": 0.01508551, + "auxiliary_loss_mlp": 0.01294011, + "balance_loss_clip": 1.1409235, + "balance_loss_mlp": 1.03117871, + "epoch": 0.4684663019298984, + "flos": 15369629942880.0, + "grad_norm": 12.025883354996237, + "language_loss": 0.66544974, + "learning_rate": 2.3003217954764672e-06, + "loss": 0.69347537, + "num_input_tokens_seen": 83815455, + "router_z_loss_clip": 3.6796875, + "router_z_loss_mlp": 2.62890625, + "step": 3896, + "time_per_iteration": 3.1894030570983887 + }, + { + "auxiliary_loss_clip": 0.01505785, + "auxiliary_loss_mlp": 0.01286007, + "balance_loss_clip": 1.13798606, + "balance_loss_mlp": 1.02450907, + "epoch": 0.46858654482053747, + "flos": 27781137321120.0, + "grad_norm": 6.4036957572592135, + "language_loss": 0.79314959, + "learning_rate": 2.299551633382949e-06, + "loss": 0.82106757, + "num_input_tokens_seen": 83835765, + "router_z_loss_clip": 3.68164062, + "router_z_loss_mlp": 2.61523438, + "step": 3897, + "time_per_iteration": 3.1049487590789795 + }, + { + "auxiliary_loss_clip": 0.01505766, + "auxiliary_loss_mlp": 0.01299482, + "balance_loss_clip": 1.13806617, + "balance_loss_mlp": 1.03970146, + "epoch": 0.4687067877111766, + "flos": 18042567757920.0, + "grad_norm": 2.1747994243618534, + "language_loss": 0.85584784, + "learning_rate": 2.2987814258476854e-06, + "loss": 0.88390028, + "num_input_tokens_seen": 83853565, + "router_z_loss_clip": 3.6796875, + "router_z_loss_mlp": 2.59765625, + "step": 3898, + "time_per_iteration": 3.0449953079223633 + }, + { + "auxiliary_loss_clip": 0.01507269, + "auxiliary_loss_mlp": 0.01292373, + "balance_loss_clip": 1.14099801, + "balance_loss_mlp": 1.03068495, + "epoch": 0.4688270306018157, + "flos": 16978975729920.0, + "grad_norm": 4.475864871957184, + "language_loss": 0.67968178, + "learning_rate": 2.2980111729875177e-06, + "loss": 0.7076782, + "num_input_tokens_seen": 83869815, + "router_z_loss_clip": 3.6640625, + "router_z_loss_mlp": 2.6171875, + "step": 3899, + "time_per_iteration": 2.9653565883636475 + }, + { + "auxiliary_loss_clip": 0.01504289, + "auxiliary_loss_mlp": 0.01288976, + "balance_loss_clip": 1.13681579, + "balance_loss_mlp": 1.02690625, + "epoch": 0.46894727349245474, + "flos": 17823568376160.0, + "grad_norm": 1.7932488448489106, + "language_loss": 0.82382369, + "learning_rate": 2.2972408749192917e-06, + "loss": 0.85175633, + "num_input_tokens_seen": 83887545, + "router_z_loss_clip": 3.67773438, + "router_z_loss_mlp": 2.62109375, + "step": 3900, + "time_per_iteration": 3.132321357727051 + }, + { + "auxiliary_loss_clip": 0.01506485, + "auxiliary_loss_mlp": 0.01287868, + "balance_loss_clip": 1.14050925, + "balance_loss_mlp": 1.02904058, + "epoch": 0.46906751638309385, + "flos": 21473545429440.0, + "grad_norm": 1.94521436658371, + "language_loss": 0.67219543, + "learning_rate": 2.296470531759861e-06, + "loss": 0.70013893, + "num_input_tokens_seen": 83905645, + "router_z_loss_clip": 3.66015625, + "router_z_loss_mlp": 2.58789062, + "step": 3901, + "time_per_iteration": 3.013108253479004 + }, + { + "auxiliary_loss_clip": 0.01505304, + "auxiliary_loss_mlp": 0.01283713, + "balance_loss_clip": 1.1391865, + "balance_loss_mlp": 1.01897323, + "epoch": 0.46918775927373296, + "flos": 20339823504960.0, + "grad_norm": 2.4207947855256866, + "language_loss": 0.80241656, + "learning_rate": 2.2957001436260866e-06, + "loss": 0.83030665, + "num_input_tokens_seen": 83922705, + "router_z_loss_clip": 3.6640625, + "router_z_loss_mlp": 2.6484375, + "step": 3902, + "time_per_iteration": 3.0466582775115967 + }, + { + "auxiliary_loss_clip": 0.01510577, + "auxiliary_loss_mlp": 0.0128752, + "balance_loss_clip": 1.14269376, + "balance_loss_mlp": 1.02773941, + "epoch": 0.469308002164372, + "flos": 18405088754400.0, + "grad_norm": 1.6695696505430297, + "language_loss": 0.7338137, + "learning_rate": 2.294929710634836e-06, + "loss": 0.76179469, + "num_input_tokens_seen": 83940795, + "router_z_loss_clip": 3.68164062, + "router_z_loss_mlp": 2.59765625, + "step": 3903, + "time_per_iteration": 3.008965253829956 + }, + { + "auxiliary_loss_clip": 0.01503464, + "auxiliary_loss_mlp": 0.01282966, + "balance_loss_clip": 1.13583517, + "balance_loss_mlp": 1.02146792, + "epoch": 0.46942824505501113, + "flos": 37965139567200.0, + "grad_norm": 1.7986782968121386, + "language_loss": 0.61483526, + "learning_rate": 2.2941592329029823e-06, + "loss": 0.6426996, + "num_input_tokens_seen": 83961900, + "router_z_loss_clip": 3.67773438, + "router_z_loss_mlp": 2.61523438, + "step": 3904, + "time_per_iteration": 3.0933406352996826 + }, + { + "auxiliary_loss_clip": 0.01509927, + "auxiliary_loss_mlp": 0.01284549, + "balance_loss_clip": 1.14271307, + "balance_loss_mlp": 1.02419591, + "epoch": 0.46954848794565024, + "flos": 21874715585280.0, + "grad_norm": 2.0677060033177175, + "language_loss": 0.79186696, + "learning_rate": 2.2933887105474067e-06, + "loss": 0.8198117, + "num_input_tokens_seen": 83980075, + "router_z_loss_clip": 3.66992188, + "router_z_loss_mlp": 2.60351562, + "step": 3905, + "time_per_iteration": 3.0324318408966064 + }, + { + "auxiliary_loss_clip": 0.01502143, + "auxiliary_loss_mlp": 0.01280461, + "balance_loss_clip": 1.13602066, + "balance_loss_mlp": 1.02239728, + "epoch": 0.4696687308362893, + "flos": 22018654409760.0, + "grad_norm": 1.6677373788693368, + "language_loss": 0.81301928, + "learning_rate": 2.2926181436849974e-06, + "loss": 0.84084523, + "num_input_tokens_seen": 83999430, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 2.58007812, + "step": 3906, + "time_per_iteration": 3.064124345779419 + }, + { + "auxiliary_loss_clip": 0.01508825, + "auxiliary_loss_mlp": 0.01293925, + "balance_loss_clip": 1.1421144, + "balance_loss_mlp": 1.03280902, + "epoch": 0.4697889737269284, + "flos": 21615663702240.0, + "grad_norm": 1.6885072668592125, + "language_loss": 0.72813582, + "learning_rate": 2.2918475324326478e-06, + "loss": 0.75616336, + "num_input_tokens_seen": 84019150, + "router_z_loss_clip": 3.66992188, + "router_z_loss_mlp": 2.61132812, + "step": 3907, + "time_per_iteration": 3.0278825759887695 + }, + { + "auxiliary_loss_clip": 0.01513208, + "auxiliary_loss_mlp": 0.01303065, + "balance_loss_clip": 1.14735699, + "balance_loss_mlp": 1.04042363, + "epoch": 0.46990921661756746, + "flos": 25230670627680.0, + "grad_norm": 3.46950308714962, + "language_loss": 0.91288614, + "learning_rate": 2.2910768769072603e-06, + "loss": 0.94104886, + "num_input_tokens_seen": 84037930, + "router_z_loss_clip": 3.66015625, + "router_z_loss_mlp": 2.62695312, + "step": 3908, + "time_per_iteration": 3.1337337493896484 + }, + { + "auxiliary_loss_clip": 0.01508655, + "auxiliary_loss_mlp": 0.01284711, + "balance_loss_clip": 1.14218998, + "balance_loss_mlp": 1.02664685, + "epoch": 0.47002945950820657, + "flos": 13845699100800.0, + "grad_norm": 2.152818858377195, + "language_loss": 0.76361322, + "learning_rate": 2.2903061772257417e-06, + "loss": 0.79154694, + "num_input_tokens_seen": 84055915, + "router_z_loss_clip": 3.6640625, + "router_z_loss_mlp": 2.58007812, + "step": 3909, + "time_per_iteration": 3.028749704360962 + }, + { + "auxiliary_loss_clip": 0.01500854, + "auxiliary_loss_mlp": 0.01285535, + "balance_loss_clip": 1.13222694, + "balance_loss_mlp": 1.02461016, + "epoch": 0.4701497023988457, + "flos": 26249241565440.0, + "grad_norm": 1.45466503138573, + "language_loss": 0.78484815, + "learning_rate": 2.289535433505007e-06, + "loss": 0.81271207, + "num_input_tokens_seen": 84077270, + "router_z_loss_clip": 3.68554688, + "router_z_loss_mlp": 2.609375, + "step": 3910, + "time_per_iteration": 3.148559331893921 + }, + { + "auxiliary_loss_clip": 0.01503231, + "auxiliary_loss_mlp": 0.01293159, + "balance_loss_clip": 1.13479137, + "balance_loss_mlp": 1.03128052, + "epoch": 0.47026994528948474, + "flos": 25631878711680.0, + "grad_norm": 2.322900526561026, + "language_loss": 0.63486481, + "learning_rate": 2.2887646458619767e-06, + "loss": 0.66282868, + "num_input_tokens_seen": 84098635, + "router_z_loss_clip": 3.6875, + "router_z_loss_mlp": 2.61914062, + "step": 3911, + "time_per_iteration": 3.1260313987731934 + }, + { + "auxiliary_loss_clip": 0.01511318, + "auxiliary_loss_mlp": 0.01311871, + "balance_loss_clip": 1.1421963, + "balance_loss_mlp": 1.04999232, + "epoch": 0.47039018818012385, + "flos": 20556281700000.0, + "grad_norm": 2.2538739931130327, + "language_loss": 0.76601028, + "learning_rate": 2.2879938144135797e-06, + "loss": 0.79424214, + "num_input_tokens_seen": 84114740, + "router_z_loss_clip": 3.69335938, + "router_z_loss_mlp": 2.61914062, + "step": 3912, + "time_per_iteration": 3.8656728267669678 + }, + { + "auxiliary_loss_clip": 0.01500442, + "auxiliary_loss_mlp": 0.01288109, + "balance_loss_clip": 1.13376999, + "balance_loss_mlp": 1.02966309, + "epoch": 0.47051043107076296, + "flos": 21579252304320.0, + "grad_norm": 1.9899646403490237, + "language_loss": 0.75388324, + "learning_rate": 2.2872229392767496e-06, + "loss": 0.7817688, + "num_input_tokens_seen": 84134845, + "router_z_loss_clip": 3.66796875, + "router_z_loss_mlp": 2.58398438, + "step": 3913, + "time_per_iteration": 4.033597230911255 + }, + { + "auxiliary_loss_clip": 0.01502541, + "auxiliary_loss_mlp": 0.01298587, + "balance_loss_clip": 1.13472128, + "balance_loss_mlp": 1.03956866, + "epoch": 0.470630673961402, + "flos": 18955204251840.0, + "grad_norm": 1.552875569924931, + "language_loss": 0.74789566, + "learning_rate": 2.286452020568428e-06, + "loss": 0.77590692, + "num_input_tokens_seen": 84152920, + "router_z_loss_clip": 3.68164062, + "router_z_loss_mlp": 2.58984375, + "step": 3914, + "time_per_iteration": 3.1550631523132324 + }, + { + "auxiliary_loss_clip": 0.01502435, + "auxiliary_loss_mlp": 0.01300852, + "balance_loss_clip": 1.13638496, + "balance_loss_mlp": 1.03706598, + "epoch": 0.4707509168520411, + "flos": 19941080751360.0, + "grad_norm": 2.153782487592536, + "language_loss": 0.73344183, + "learning_rate": 2.2856810584055637e-06, + "loss": 0.76147461, + "num_input_tokens_seen": 84170455, + "router_z_loss_clip": 3.65820312, + "router_z_loss_mlp": 2.63867188, + "step": 3915, + "time_per_iteration": 3.0459041595458984 + }, + { + "auxiliary_loss_clip": 0.01498071, + "auxiliary_loss_mlp": 0.01282652, + "balance_loss_clip": 1.13139701, + "balance_loss_mlp": 1.02248931, + "epoch": 0.47087115974268023, + "flos": 40122173449440.0, + "grad_norm": 2.434406452751416, + "language_loss": 0.67784464, + "learning_rate": 2.2849100529051085e-06, + "loss": 0.70565182, + "num_input_tokens_seen": 84197390, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 2.6015625, + "step": 3916, + "time_per_iteration": 3.9801435470581055 + }, + { + "auxiliary_loss_clip": 0.01505691, + "auxiliary_loss_mlp": 0.01281166, + "balance_loss_clip": 1.13863969, + "balance_loss_mlp": 1.02348292, + "epoch": 0.4709914026333193, + "flos": 13554407917440.0, + "grad_norm": 2.5131630396965092, + "language_loss": 0.79805654, + "learning_rate": 2.284139004184026e-06, + "loss": 0.82592511, + "num_input_tokens_seen": 84214620, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 2.57617188, + "step": 3917, + "time_per_iteration": 3.0132861137390137 + }, + { + "auxiliary_loss_clip": 0.01506949, + "auxiliary_loss_mlp": 0.01283681, + "balance_loss_clip": 1.13996434, + "balance_loss_mlp": 1.02313733, + "epoch": 0.4711116455239584, + "flos": 19976695657920.0, + "grad_norm": 3.672010026469039, + "language_loss": 0.74189627, + "learning_rate": 2.2833679123592814e-06, + "loss": 0.76980251, + "num_input_tokens_seen": 84231880, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 2.60546875, + "step": 3918, + "time_per_iteration": 3.0313920974731445 + }, + { + "auxiliary_loss_clip": 0.01499732, + "auxiliary_loss_mlp": 0.01279997, + "balance_loss_clip": 1.13216519, + "balance_loss_mlp": 1.02021635, + "epoch": 0.4712318884145975, + "flos": 32127596098560.0, + "grad_norm": 1.7547038088995361, + "language_loss": 0.63415182, + "learning_rate": 2.2825967775478508e-06, + "loss": 0.6619491, + "num_input_tokens_seen": 84252980, + "router_z_loss_clip": 3.68164062, + "router_z_loss_mlp": 2.59765625, + "step": 3919, + "time_per_iteration": 3.906749963760376 + }, + { + "auxiliary_loss_clip": 0.01504214, + "auxiliary_loss_mlp": 0.01285279, + "balance_loss_clip": 1.13741851, + "balance_loss_mlp": 1.02664256, + "epoch": 0.47135213130523657, + "flos": 20049935663520.0, + "grad_norm": 2.2233790351218756, + "language_loss": 0.83807504, + "learning_rate": 2.2818255998667135e-06, + "loss": 0.86596996, + "num_input_tokens_seen": 84271490, + "router_z_loss_clip": 3.66992188, + "router_z_loss_mlp": 2.5859375, + "step": 3920, + "time_per_iteration": 2.956233024597168 + }, + { + "auxiliary_loss_clip": 0.01511092, + "auxiliary_loss_mlp": 0.01287841, + "balance_loss_clip": 1.14381623, + "balance_loss_mlp": 1.02939534, + "epoch": 0.4714723741958757, + "flos": 19429007562720.0, + "grad_norm": 1.6636681197250043, + "language_loss": 0.79282922, + "learning_rate": 2.2810543794328566e-06, + "loss": 0.8208186, + "num_input_tokens_seen": 84290525, + "router_z_loss_clip": 3.67382812, + "router_z_loss_mlp": 2.58398438, + "step": 3921, + "time_per_iteration": 2.969886541366577 + }, + { + "auxiliary_loss_clip": 0.01507166, + "auxiliary_loss_mlp": 0.01289275, + "balance_loss_clip": 1.13895249, + "balance_loss_mlp": 1.02873158, + "epoch": 0.4715926170865148, + "flos": 20375855621280.0, + "grad_norm": 1.6866204690438145, + "language_loss": 0.82487786, + "learning_rate": 2.2802831163632735e-06, + "loss": 0.85284233, + "num_input_tokens_seen": 84309245, + "router_z_loss_clip": 3.68554688, + "router_z_loss_mlp": 2.60546875, + "step": 3922, + "time_per_iteration": 2.973992109298706 + }, + { + "auxiliary_loss_clip": 0.01502841, + "auxiliary_loss_mlp": 0.01287201, + "balance_loss_clip": 1.13500905, + "balance_loss_mlp": 1.02703857, + "epoch": 0.47171285997715384, + "flos": 22675007776320.0, + "grad_norm": 1.727590827725257, + "language_loss": 0.74637461, + "learning_rate": 2.279511810774965e-06, + "loss": 0.774275, + "num_input_tokens_seen": 84330775, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 2.6015625, + "step": 3923, + "time_per_iteration": 3.013118267059326 + }, + { + "auxiliary_loss_clip": 0.01501618, + "auxiliary_loss_mlp": 0.01281412, + "balance_loss_clip": 1.13303542, + "balance_loss_mlp": 1.02182198, + "epoch": 0.47183310286779295, + "flos": 21107535042240.0, + "grad_norm": 1.9753509854318323, + "language_loss": 0.71545088, + "learning_rate": 2.2787404627849364e-06, + "loss": 0.74328119, + "num_input_tokens_seen": 84349985, + "router_z_loss_clip": 3.68554688, + "router_z_loss_mlp": 2.59570312, + "step": 3924, + "time_per_iteration": 2.941995620727539 + }, + { + "auxiliary_loss_clip": 0.01505055, + "auxiliary_loss_mlp": 0.01278506, + "balance_loss_clip": 1.13708889, + "balance_loss_mlp": 1.02101433, + "epoch": 0.471953345758432, + "flos": 21728766568320.0, + "grad_norm": 1.908626753160794, + "language_loss": 0.79064846, + "learning_rate": 2.277969072510202e-06, + "loss": 0.81848407, + "num_input_tokens_seen": 84368965, + "router_z_loss_clip": 3.67773438, + "router_z_loss_mlp": 2.57421875, + "step": 3925, + "time_per_iteration": 3.2132527828216553 + }, + { + "auxiliary_loss_clip": 0.01500226, + "auxiliary_loss_mlp": 0.01282411, + "balance_loss_clip": 1.13096917, + "balance_loss_mlp": 1.02453768, + "epoch": 0.4720735886490711, + "flos": 19862910084960.0, + "grad_norm": 1.60441224100223, + "language_loss": 0.81499135, + "learning_rate": 2.2771976400677803e-06, + "loss": 0.84281778, + "num_input_tokens_seen": 84387795, + "router_z_loss_clip": 3.6953125, + "router_z_loss_mlp": 2.578125, + "step": 3926, + "time_per_iteration": 2.954167366027832 + }, + { + "auxiliary_loss_clip": 0.01503596, + "auxiliary_loss_mlp": 0.01277909, + "balance_loss_clip": 1.13354337, + "balance_loss_mlp": 1.02289701, + "epoch": 0.47219383153971023, + "flos": 19173824352000.0, + "grad_norm": 1.7735461285040528, + "language_loss": 0.7949214, + "learning_rate": 2.2764261655746965e-06, + "loss": 0.82273638, + "num_input_tokens_seen": 84405290, + "router_z_loss_clip": 3.70117188, + "router_z_loss_mlp": 2.54882812, + "step": 3927, + "time_per_iteration": 2.9457857608795166 + }, + { + "auxiliary_loss_clip": 0.01503896, + "auxiliary_loss_mlp": 0.01284264, + "balance_loss_clip": 1.13514221, + "balance_loss_mlp": 1.02429247, + "epoch": 0.4723140744303493, + "flos": 23226374903040.0, + "grad_norm": 2.1929370068587306, + "language_loss": 0.76189053, + "learning_rate": 2.2756546491479832e-06, + "loss": 0.78977209, + "num_input_tokens_seen": 84426205, + "router_z_loss_clip": 3.68945312, + "router_z_loss_mlp": 2.59960938, + "step": 3928, + "time_per_iteration": 2.97806715965271 + }, + { + "auxiliary_loss_clip": 0.01505472, + "auxiliary_loss_mlp": 0.01288349, + "balance_loss_clip": 1.13700199, + "balance_loss_mlp": 1.02856839, + "epoch": 0.4724343173209884, + "flos": 18225231598080.0, + "grad_norm": 2.5463636569328463, + "language_loss": 0.80159259, + "learning_rate": 2.274883090904679e-06, + "loss": 0.82953078, + "num_input_tokens_seen": 84443970, + "router_z_loss_clip": 3.68554688, + "router_z_loss_mlp": 2.59765625, + "step": 3929, + "time_per_iteration": 2.9781832695007324 + }, + { + "auxiliary_loss_clip": 0.01510596, + "auxiliary_loss_mlp": 0.01296694, + "balance_loss_clip": 1.14195347, + "balance_loss_mlp": 1.03653193, + "epoch": 0.4725545602116275, + "flos": 21253142705760.0, + "grad_norm": 3.9759986306349213, + "language_loss": 0.67893064, + "learning_rate": 2.2741114909618283e-06, + "loss": 0.70700347, + "num_input_tokens_seen": 84459865, + "router_z_loss_clip": 3.69140625, + "router_z_loss_mlp": 2.6015625, + "step": 3930, + "time_per_iteration": 3.092489242553711 + }, + { + "auxiliary_loss_clip": 0.01504282, + "auxiliary_loss_mlp": 0.012823, + "balance_loss_clip": 1.13362551, + "balance_loss_mlp": 1.02499855, + "epoch": 0.47267480310226656, + "flos": 21436147899360.0, + "grad_norm": 1.7854368158673377, + "language_loss": 0.72014171, + "learning_rate": 2.2733398494364828e-06, + "loss": 0.74800754, + "num_input_tokens_seen": 84479110, + "router_z_loss_clip": 3.7109375, + "router_z_loss_mlp": 2.57226562, + "step": 3931, + "time_per_iteration": 3.1094233989715576 + }, + { + "auxiliary_loss_clip": 0.01504071, + "auxiliary_loss_mlp": 0.01288253, + "balance_loss_clip": 1.13532197, + "balance_loss_mlp": 1.02923548, + "epoch": 0.47279504599290567, + "flos": 18772502483520.0, + "grad_norm": 2.05158345017986, + "language_loss": 0.84770143, + "learning_rate": 2.272568166445699e-06, + "loss": 0.87562466, + "num_input_tokens_seen": 84497675, + "router_z_loss_clip": 3.68554688, + "router_z_loss_mlp": 2.58984375, + "step": 3932, + "time_per_iteration": 3.14249849319458 + }, + { + "auxiliary_loss_clip": 0.01503499, + "auxiliary_loss_mlp": 0.01284688, + "balance_loss_clip": 1.13361096, + "balance_loss_mlp": 1.0254792, + "epoch": 0.4729152888835448, + "flos": 21107876395680.0, + "grad_norm": 2.6602264344392226, + "language_loss": 0.64892519, + "learning_rate": 2.271796442106541e-06, + "loss": 0.67680705, + "num_input_tokens_seen": 84517030, + "router_z_loss_clip": 3.70507812, + "router_z_loss_mlp": 2.59179688, + "step": 3933, + "time_per_iteration": 3.0779001712799072 + }, + { + "auxiliary_loss_clip": 0.01553851, + "auxiliary_loss_mlp": 0.01206268, + "balance_loss_clip": 1.18847322, + "balance_loss_mlp": 0.99645996, + "epoch": 0.47303553177418384, + "flos": 70208038365120.0, + "grad_norm": 0.810312853890501, + "language_loss": 0.5645951, + "learning_rate": 2.271024676536079e-06, + "loss": 0.59219635, + "num_input_tokens_seen": 84577290, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 2.1015625, + "step": 3934, + "time_per_iteration": 3.434906244277954 + }, + { + "auxiliary_loss_clip": 0.01510176, + "auxiliary_loss_mlp": 0.01304115, + "balance_loss_clip": 1.14112377, + "balance_loss_mlp": 1.03823066, + "epoch": 0.47315577466482295, + "flos": 22457297952000.0, + "grad_norm": 2.389741703679794, + "language_loss": 0.73525566, + "learning_rate": 2.2702528698513894e-06, + "loss": 0.76339853, + "num_input_tokens_seen": 84598415, + "router_z_loss_clip": 3.69140625, + "router_z_loss_mlp": 2.66015625, + "step": 3935, + "time_per_iteration": 3.0900771617889404 + }, + { + "auxiliary_loss_clip": 0.01506746, + "auxiliary_loss_mlp": 0.01288872, + "balance_loss_clip": 1.13705814, + "balance_loss_mlp": 1.02661133, + "epoch": 0.47327601755546206, + "flos": 24355166166720.0, + "grad_norm": 1.8184237264984062, + "language_loss": 0.78702772, + "learning_rate": 2.269481022169554e-06, + "loss": 0.8149839, + "num_input_tokens_seen": 84617010, + "router_z_loss_clip": 3.69921875, + "router_z_loss_mlp": 2.62304688, + "step": 3936, + "time_per_iteration": 3.0558063983917236 + }, + { + "auxiliary_loss_clip": 0.01505822, + "auxiliary_loss_mlp": 0.01289997, + "balance_loss_clip": 1.1345427, + "balance_loss_mlp": 1.03021622, + "epoch": 0.4733962604461011, + "flos": 22928825573280.0, + "grad_norm": 1.8103717027046669, + "language_loss": 0.80918515, + "learning_rate": 2.2687091336076614e-06, + "loss": 0.83714336, + "num_input_tokens_seen": 84636350, + "router_z_loss_clip": 3.71679688, + "router_z_loss_mlp": 2.59765625, + "step": 3937, + "time_per_iteration": 3.0803422927856445 + }, + { + "auxiliary_loss_clip": 0.0150844, + "auxiliary_loss_mlp": 0.01284335, + "balance_loss_clip": 1.13896585, + "balance_loss_mlp": 1.02398181, + "epoch": 0.4735165033367402, + "flos": 18329042064960.0, + "grad_norm": 1.8366577118600471, + "language_loss": 0.80419552, + "learning_rate": 2.267937204282807e-06, + "loss": 0.83212322, + "num_input_tokens_seen": 84653490, + "router_z_loss_clip": 3.6953125, + "router_z_loss_mlp": 2.60351562, + "step": 3938, + "time_per_iteration": 3.0816893577575684 + }, + { + "auxiliary_loss_clip": 0.01510934, + "auxiliary_loss_mlp": 0.01298101, + "balance_loss_clip": 1.14121222, + "balance_loss_mlp": 1.0346961, + "epoch": 0.4736367462273793, + "flos": 23039121755520.0, + "grad_norm": 2.289189420101863, + "language_loss": 0.79103345, + "learning_rate": 2.2671652343120926e-06, + "loss": 0.8191238, + "num_input_tokens_seen": 84673965, + "router_z_loss_clip": 3.69921875, + "router_z_loss_mlp": 2.63476562, + "step": 3939, + "time_per_iteration": 3.053133487701416 + }, + { + "auxiliary_loss_clip": 0.01511588, + "auxiliary_loss_mlp": 0.01293692, + "balance_loss_clip": 1.13963687, + "balance_loss_mlp": 1.03162265, + "epoch": 0.4737569891180184, + "flos": 25376695500960.0, + "grad_norm": 1.7234075834549751, + "language_loss": 0.80736923, + "learning_rate": 2.2663932238126236e-06, + "loss": 0.8354221, + "num_input_tokens_seen": 84692525, + "router_z_loss_clip": 3.71875, + "router_z_loss_mlp": 2.62109375, + "step": 3940, + "time_per_iteration": 4.793766975402832 + }, + { + "auxiliary_loss_clip": 0.01507663, + "auxiliary_loss_mlp": 0.0128666, + "balance_loss_clip": 1.13703728, + "balance_loss_mlp": 1.02745128, + "epoch": 0.4738772320086575, + "flos": 25851750441120.0, + "grad_norm": 1.5578461863407715, + "language_loss": 0.8032397, + "learning_rate": 2.265621172901515e-06, + "loss": 0.83118296, + "num_input_tokens_seen": 84715640, + "router_z_loss_clip": 3.70507812, + "router_z_loss_mlp": 2.59179688, + "step": 3941, + "time_per_iteration": 3.06965708732605 + }, + { + "auxiliary_loss_clip": 0.0151507, + "auxiliary_loss_mlp": 0.01299107, + "balance_loss_clip": 1.14452505, + "balance_loss_mlp": 1.03799081, + "epoch": 0.47399747489929656, + "flos": 27566954815680.0, + "grad_norm": 2.184166147378192, + "language_loss": 0.7128548, + "learning_rate": 2.2648490816958854e-06, + "loss": 0.7409966, + "num_input_tokens_seen": 84736635, + "router_z_loss_clip": 3.7109375, + "router_z_loss_mlp": 2.61132812, + "step": 3942, + "time_per_iteration": 3.0396153926849365 + }, + { + "auxiliary_loss_clip": 0.01504094, + "auxiliary_loss_mlp": 0.01296971, + "balance_loss_clip": 1.13328564, + "balance_loss_mlp": 1.03451991, + "epoch": 0.47411771778993567, + "flos": 24865987726080.0, + "grad_norm": 3.0270939179392986, + "language_loss": 0.73210043, + "learning_rate": 2.264076950312861e-06, + "loss": 0.76011109, + "num_input_tokens_seen": 84755445, + "router_z_loss_clip": 3.70898438, + "router_z_loss_mlp": 2.625, + "step": 3943, + "time_per_iteration": 3.0239081382751465 + }, + { + "auxiliary_loss_clip": 0.01503826, + "auxiliary_loss_mlp": 0.01296259, + "balance_loss_clip": 1.13470125, + "balance_loss_mlp": 1.03495193, + "epoch": 0.4742379606805748, + "flos": 22750257974400.0, + "grad_norm": 2.9193805760371565, + "language_loss": 0.82178915, + "learning_rate": 2.2633047788695727e-06, + "loss": 0.84978998, + "num_input_tokens_seen": 84775750, + "router_z_loss_clip": 3.69335938, + "router_z_loss_mlp": 2.61328125, + "step": 3944, + "time_per_iteration": 3.846911668777466 + }, + { + "auxiliary_loss_clip": 0.01513413, + "auxiliary_loss_mlp": 0.01285272, + "balance_loss_clip": 1.14353728, + "balance_loss_mlp": 1.02453732, + "epoch": 0.47435820357121383, + "flos": 19683090856800.0, + "grad_norm": 1.766285031603838, + "language_loss": 0.64341092, + "learning_rate": 2.262532567483159e-06, + "loss": 0.67139781, + "num_input_tokens_seen": 84794310, + "router_z_loss_clip": 3.69921875, + "router_z_loss_mlp": 2.60742188, + "step": 3945, + "time_per_iteration": 2.9863216876983643 + }, + { + "auxiliary_loss_clip": 0.01519186, + "auxiliary_loss_mlp": 0.01291895, + "balance_loss_clip": 1.14933872, + "balance_loss_mlp": 1.02982569, + "epoch": 0.47447844646185294, + "flos": 25231087837440.0, + "grad_norm": 3.028834519815306, + "language_loss": 0.80330002, + "learning_rate": 2.2617603162707635e-06, + "loss": 0.83141088, + "num_input_tokens_seen": 84814720, + "router_z_loss_clip": 3.69921875, + "router_z_loss_mlp": 2.62109375, + "step": 3946, + "time_per_iteration": 2.957836151123047 + }, + { + "auxiliary_loss_clip": 0.01508307, + "auxiliary_loss_mlp": 0.01281126, + "balance_loss_clip": 1.13823497, + "balance_loss_mlp": 1.02039111, + "epoch": 0.47459868935249205, + "flos": 24572875991040.0, + "grad_norm": 2.0376467629234107, + "language_loss": 0.82849538, + "learning_rate": 2.2609880253495363e-06, + "loss": 0.85638964, + "num_input_tokens_seen": 84834355, + "router_z_loss_clip": 3.70117188, + "router_z_loss_mlp": 2.60742188, + "step": 3947, + "time_per_iteration": 3.8565773963928223 + }, + { + "auxiliary_loss_clip": 0.01508503, + "auxiliary_loss_mlp": 0.01280527, + "balance_loss_clip": 1.13708532, + "balance_loss_mlp": 1.0197922, + "epoch": 0.4747189322431311, + "flos": 20560605510240.0, + "grad_norm": 2.993082545998822, + "language_loss": 0.86732066, + "learning_rate": 2.260215694836633e-06, + "loss": 0.89521098, + "num_input_tokens_seen": 84853530, + "router_z_loss_clip": 3.71875, + "router_z_loss_mlp": 2.60742188, + "step": 3948, + "time_per_iteration": 3.1018338203430176 + }, + { + "auxiliary_loss_clip": 0.01509992, + "auxiliary_loss_mlp": 0.01288324, + "balance_loss_clip": 1.13729119, + "balance_loss_mlp": 1.028162, + "epoch": 0.4748391751337702, + "flos": 25997661529920.0, + "grad_norm": 3.032715608147361, + "language_loss": 0.6480161, + "learning_rate": 2.2594433248492157e-06, + "loss": 0.67599928, + "num_input_tokens_seen": 84872505, + "router_z_loss_clip": 3.72851562, + "router_z_loss_mlp": 2.6015625, + "step": 3949, + "time_per_iteration": 3.0036849975585938 + }, + { + "auxiliary_loss_clip": 0.01508084, + "auxiliary_loss_mlp": 0.01286069, + "balance_loss_clip": 1.13869023, + "balance_loss_mlp": 1.02800453, + "epoch": 0.47495941802440933, + "flos": 22823308339200.0, + "grad_norm": 1.8517422045595573, + "language_loss": 0.80573505, + "learning_rate": 2.2586709155044527e-06, + "loss": 0.83367658, + "num_input_tokens_seen": 84893105, + "router_z_loss_clip": 3.6953125, + "router_z_loss_mlp": 2.58007812, + "step": 3950, + "time_per_iteration": 3.0382814407348633 + }, + { + "auxiliary_loss_clip": 0.01507475, + "auxiliary_loss_mlp": 0.01284379, + "balance_loss_clip": 1.13813615, + "balance_loss_mlp": 1.02478909, + "epoch": 0.4750796609150484, + "flos": 27894278115360.0, + "grad_norm": 2.279613558544793, + "language_loss": 0.76061928, + "learning_rate": 2.2578984669195167e-06, + "loss": 0.7885378, + "num_input_tokens_seen": 84914070, + "router_z_loss_clip": 3.68945312, + "router_z_loss_mlp": 2.59570312, + "step": 3951, + "time_per_iteration": 3.025747537612915 + }, + { + "auxiliary_loss_clip": 0.01505225, + "auxiliary_loss_mlp": 0.01277381, + "balance_loss_clip": 1.13353193, + "balance_loss_mlp": 1.02007937, + "epoch": 0.4751999038056875, + "flos": 35662839374880.0, + "grad_norm": 2.14163436838376, + "language_loss": 0.67709768, + "learning_rate": 2.2571259792115887e-06, + "loss": 0.70492369, + "num_input_tokens_seen": 84935290, + "router_z_loss_clip": 3.71679688, + "router_z_loss_mlp": 2.57226562, + "step": 3952, + "time_per_iteration": 3.180023670196533 + }, + { + "auxiliary_loss_clip": 0.01513785, + "auxiliary_loss_mlp": 0.01279848, + "balance_loss_clip": 1.14169455, + "balance_loss_mlp": 1.02331018, + "epoch": 0.4753201466963266, + "flos": 22093070188320.0, + "grad_norm": 2.351526060076209, + "language_loss": 0.79674387, + "learning_rate": 2.2563534524978544e-06, + "loss": 0.82468021, + "num_input_tokens_seen": 84952760, + "router_z_loss_clip": 3.72265625, + "router_z_loss_mlp": 2.56445312, + "step": 3953, + "time_per_iteration": 3.0468945503234863 + }, + { + "auxiliary_loss_clip": 0.01512073, + "auxiliary_loss_mlp": 0.01278957, + "balance_loss_clip": 1.14118409, + "balance_loss_mlp": 1.02051163, + "epoch": 0.47544038958696566, + "flos": 30193506126720.0, + "grad_norm": 1.8754212242951525, + "language_loss": 0.70947242, + "learning_rate": 2.2555808868955052e-06, + "loss": 0.73738271, + "num_input_tokens_seen": 84974890, + "router_z_loss_clip": 3.70898438, + "router_z_loss_mlp": 2.58398438, + "step": 3954, + "time_per_iteration": 3.07437801361084 + }, + { + "auxiliary_loss_clip": 0.01508149, + "auxiliary_loss_mlp": 0.01297864, + "balance_loss_clip": 1.13876104, + "balance_loss_mlp": 1.03426862, + "epoch": 0.47556063247760477, + "flos": 23474200050720.0, + "grad_norm": 2.5411696668772783, + "language_loss": 0.7393046, + "learning_rate": 2.254808282521738e-06, + "loss": 0.76736474, + "num_input_tokens_seen": 84993640, + "router_z_loss_clip": 3.6953125, + "router_z_loss_mlp": 2.63671875, + "step": 3955, + "time_per_iteration": 3.036447763442993 + }, + { + "auxiliary_loss_clip": 0.01514431, + "auxiliary_loss_mlp": 0.01295233, + "balance_loss_clip": 1.14356446, + "balance_loss_mlp": 1.03793204, + "epoch": 0.4756808753682438, + "flos": 25157544406560.0, + "grad_norm": 2.063368240424429, + "language_loss": 0.80789834, + "learning_rate": 2.2540356394937573e-06, + "loss": 0.83599502, + "num_input_tokens_seen": 85012340, + "router_z_loss_clip": 3.71289062, + "router_z_loss_mlp": 2.57226562, + "step": 3956, + "time_per_iteration": 3.0481035709381104 + }, + { + "auxiliary_loss_clip": 0.01512414, + "auxiliary_loss_mlp": 0.01294565, + "balance_loss_clip": 1.14065242, + "balance_loss_mlp": 1.03383017, + "epoch": 0.47580111825888294, + "flos": 15671578939200.0, + "grad_norm": 2.6310391986995736, + "language_loss": 0.84086698, + "learning_rate": 2.253262957928772e-06, + "loss": 0.86893678, + "num_input_tokens_seen": 85029225, + "router_z_loss_clip": 3.72460938, + "router_z_loss_mlp": 2.60742188, + "step": 3957, + "time_per_iteration": 2.953979253768921 + }, + { + "auxiliary_loss_clip": 0.0150832, + "auxiliary_loss_mlp": 0.01293347, + "balance_loss_clip": 1.13669574, + "balance_loss_mlp": 1.03184915, + "epoch": 0.47592136114952205, + "flos": 17638325421120.0, + "grad_norm": 2.178821507725128, + "language_loss": 0.72171837, + "learning_rate": 2.2524902379439976e-06, + "loss": 0.749735, + "num_input_tokens_seen": 85047895, + "router_z_loss_clip": 3.71484375, + "router_z_loss_mlp": 2.61523438, + "step": 3958, + "time_per_iteration": 3.004363536834717 + }, + { + "auxiliary_loss_clip": 0.01563213, + "auxiliary_loss_mlp": 0.01201385, + "balance_loss_clip": 1.19639421, + "balance_loss_mlp": 0.99157715, + "epoch": 0.4760416040401611, + "flos": 61423295641920.0, + "grad_norm": 0.7401599774299171, + "language_loss": 0.6370976, + "learning_rate": 2.251717479656655e-06, + "loss": 0.66474354, + "num_input_tokens_seen": 85112690, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 2.1015625, + "step": 3959, + "time_per_iteration": 3.5366320610046387 + }, + { + "auxiliary_loss_clip": 0.01510689, + "auxiliary_loss_mlp": 0.01290947, + "balance_loss_clip": 1.14051342, + "balance_loss_mlp": 1.03002119, + "epoch": 0.4761618469308002, + "flos": 18407857510080.0, + "grad_norm": 2.3319677862255612, + "language_loss": 0.76388192, + "learning_rate": 2.2509446831839704e-06, + "loss": 0.79189831, + "num_input_tokens_seen": 85132130, + "router_z_loss_clip": 3.703125, + "router_z_loss_mlp": 2.609375, + "step": 3960, + "time_per_iteration": 3.0365312099456787 + }, + { + "auxiliary_loss_clip": 0.01512075, + "auxiliary_loss_mlp": 0.01285009, + "balance_loss_clip": 1.14219642, + "balance_loss_mlp": 1.02484703, + "epoch": 0.4762820898214393, + "flos": 18042871183200.0, + "grad_norm": 2.2515992862774543, + "language_loss": 0.82603478, + "learning_rate": 2.250171848643177e-06, + "loss": 0.85400569, + "num_input_tokens_seen": 85149420, + "router_z_loss_clip": 3.70117188, + "router_z_loss_mlp": 2.6015625, + "step": 3961, + "time_per_iteration": 3.0725882053375244 + }, + { + "auxiliary_loss_clip": 0.01510036, + "auxiliary_loss_mlp": 0.0129815, + "balance_loss_clip": 1.13848269, + "balance_loss_mlp": 1.04103971, + "epoch": 0.4764023327120784, + "flos": 19320304363200.0, + "grad_norm": 1.9584285447634209, + "language_loss": 0.86166126, + "learning_rate": 2.249398976151513e-06, + "loss": 0.88974315, + "num_input_tokens_seen": 85166970, + "router_z_loss_clip": 3.71875, + "router_z_loss_mlp": 2.5703125, + "step": 3962, + "time_per_iteration": 2.9993467330932617 + }, + { + "auxiliary_loss_clip": 0.0150856, + "auxiliary_loss_mlp": 0.0128154, + "balance_loss_clip": 1.1373409, + "balance_loss_mlp": 1.02290368, + "epoch": 0.4765225756027175, + "flos": 22749461483040.0, + "grad_norm": 2.115535504536876, + "language_loss": 0.78606951, + "learning_rate": 2.248626065826223e-06, + "loss": 0.81397051, + "num_input_tokens_seen": 85185175, + "router_z_loss_clip": 3.71289062, + "router_z_loss_mlp": 2.5859375, + "step": 3963, + "time_per_iteration": 3.0298755168914795 + }, + { + "auxiliary_loss_clip": 0.01545418, + "auxiliary_loss_mlp": 0.01217896, + "balance_loss_clip": 1.18023813, + "balance_loss_mlp": 1.00808716, + "epoch": 0.4766428184933566, + "flos": 65939333044320.0, + "grad_norm": 0.7709982444644254, + "language_loss": 0.62516469, + "learning_rate": 2.2478531177845564e-06, + "loss": 0.65279782, + "num_input_tokens_seen": 85246170, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 2.1015625, + "step": 3964, + "time_per_iteration": 3.5440657138824463 + }, + { + "auxiliary_loss_clip": 0.01513265, + "auxiliary_loss_mlp": 0.01288071, + "balance_loss_clip": 1.14286005, + "balance_loss_mlp": 1.02981651, + "epoch": 0.47676306138399566, + "flos": 24138935540640.0, + "grad_norm": 1.8000210532718282, + "language_loss": 0.85206366, + "learning_rate": 2.247080132143769e-06, + "loss": 0.880077, + "num_input_tokens_seen": 85268525, + "router_z_loss_clip": 3.70898438, + "router_z_loss_mlp": 2.58203125, + "step": 3965, + "time_per_iteration": 3.0925631523132324 + }, + { + "auxiliary_loss_clip": 0.01506022, + "auxiliary_loss_mlp": 0.01296198, + "balance_loss_clip": 1.13588834, + "balance_loss_mlp": 1.03298378, + "epoch": 0.47688330427463477, + "flos": 12605966876160.0, + "grad_norm": 2.256663931308568, + "language_loss": 0.69411761, + "learning_rate": 2.246307109021121e-06, + "loss": 0.72213978, + "num_input_tokens_seen": 85285930, + "router_z_loss_clip": 3.70117188, + "router_z_loss_mlp": 2.6328125, + "step": 3966, + "time_per_iteration": 3.1292884349823 + }, + { + "auxiliary_loss_clip": 0.01511222, + "auxiliary_loss_mlp": 0.01281646, + "balance_loss_clip": 1.14240491, + "balance_loss_mlp": 1.02548909, + "epoch": 0.4770035471652739, + "flos": 21392340510240.0, + "grad_norm": 1.9674895779984414, + "language_loss": 0.82403207, + "learning_rate": 2.2455340485338817e-06, + "loss": 0.85196078, + "num_input_tokens_seen": 85303565, + "router_z_loss_clip": 3.68945312, + "router_z_loss_mlp": 2.56054688, + "step": 3967, + "time_per_iteration": 3.9665610790252686 + }, + { + "auxiliary_loss_clip": 0.01513141, + "auxiliary_loss_mlp": 0.0128471, + "balance_loss_clip": 1.14219344, + "balance_loss_mlp": 1.02817118, + "epoch": 0.47712379005591293, + "flos": 25158644323200.0, + "grad_norm": 2.3865295940665727, + "language_loss": 0.68097031, + "learning_rate": 2.244760950799322e-06, + "loss": 0.70894885, + "num_input_tokens_seen": 85321835, + "router_z_loss_clip": 3.7109375, + "router_z_loss_mlp": 2.56445312, + "step": 3968, + "time_per_iteration": 4.021780967712402 + }, + { + "auxiliary_loss_clip": 0.01511667, + "auxiliary_loss_mlp": 0.01286069, + "balance_loss_clip": 1.14185214, + "balance_loss_mlp": 1.02724147, + "epoch": 0.47724403294655204, + "flos": 22056507077760.0, + "grad_norm": 2.216273835995496, + "language_loss": 0.72332913, + "learning_rate": 2.2439878159347203e-06, + "loss": 0.75130653, + "num_input_tokens_seen": 85341260, + "router_z_loss_clip": 3.69726562, + "router_z_loss_mlp": 2.58789062, + "step": 3969, + "time_per_iteration": 3.159337043762207 + }, + { + "auxiliary_loss_clip": 0.01530647, + "auxiliary_loss_mlp": 0.01238754, + "balance_loss_clip": 1.16396999, + "balance_loss_mlp": 1.02742004, + "epoch": 0.4773642758371911, + "flos": 70237091335680.0, + "grad_norm": 0.7402579344486463, + "language_loss": 0.55210888, + "learning_rate": 2.2432146440573616e-06, + "loss": 0.57980287, + "num_input_tokens_seen": 85407220, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 2.1171875, + "step": 3970, + "time_per_iteration": 3.6408066749572754 + }, + { + "auxiliary_loss_clip": 0.0151307, + "auxiliary_loss_mlp": 0.01280391, + "balance_loss_clip": 1.1419692, + "balance_loss_mlp": 1.02308953, + "epoch": 0.4774845187278302, + "flos": 23550777734400.0, + "grad_norm": 2.7057373765768897, + "language_loss": 0.66182745, + "learning_rate": 2.242441435284534e-06, + "loss": 0.68976206, + "num_input_tokens_seen": 85426095, + "router_z_loss_clip": 3.7109375, + "router_z_loss_mlp": 2.57226562, + "step": 3971, + "time_per_iteration": 3.9775781631469727 + }, + { + "auxiliary_loss_clip": 0.01511439, + "auxiliary_loss_mlp": 0.01287766, + "balance_loss_clip": 1.14244699, + "balance_loss_mlp": 1.0260781, + "epoch": 0.4776047616184693, + "flos": 23077581274080.0, + "grad_norm": 2.8104587540039057, + "language_loss": 0.8513571, + "learning_rate": 2.2416681897335337e-06, + "loss": 0.87934911, + "num_input_tokens_seen": 85444245, + "router_z_loss_clip": 3.6875, + "router_z_loss_mlp": 2.6171875, + "step": 3972, + "time_per_iteration": 3.197165012359619 + }, + { + "auxiliary_loss_clip": 0.01504736, + "auxiliary_loss_mlp": 0.01282814, + "balance_loss_clip": 1.13554776, + "balance_loss_mlp": 1.02246141, + "epoch": 0.4777250045091084, + "flos": 31900442162400.0, + "grad_norm": 1.83126547786647, + "language_loss": 0.67051542, + "learning_rate": 2.240894907521661e-06, + "loss": 0.6983909, + "num_input_tokens_seen": 85463325, + "router_z_loss_clip": 3.69726562, + "router_z_loss_mlp": 2.60351562, + "step": 3973, + "time_per_iteration": 3.1619837284088135 + }, + { + "auxiliary_loss_clip": 0.01513865, + "auxiliary_loss_mlp": 0.01282533, + "balance_loss_clip": 1.14337814, + "balance_loss_mlp": 1.02408743, + "epoch": 0.4778452473997475, + "flos": 24280371106560.0, + "grad_norm": 2.1013553602151047, + "language_loss": 0.6365152, + "learning_rate": 2.240121588766223e-06, + "loss": 0.66447914, + "num_input_tokens_seen": 85483375, + "router_z_loss_clip": 3.70703125, + "router_z_loss_mlp": 2.58398438, + "step": 3974, + "time_per_iteration": 3.8879847526550293 + }, + { + "auxiliary_loss_clip": 0.01511486, + "auxiliary_loss_mlp": 0.01276606, + "balance_loss_clip": 1.14108062, + "balance_loss_mlp": 1.0206399, + "epoch": 0.4779654902903866, + "flos": 31577632313760.0, + "grad_norm": 2.082089603096584, + "language_loss": 0.71667701, + "learning_rate": 2.239348233584531e-06, + "loss": 0.74455798, + "num_input_tokens_seen": 85504230, + "router_z_loss_clip": 3.70507812, + "router_z_loss_mlp": 2.55859375, + "step": 3975, + "time_per_iteration": 3.195889472961426 + }, + { + "auxiliary_loss_clip": 0.01506762, + "auxiliary_loss_mlp": 0.01276487, + "balance_loss_clip": 1.13750315, + "balance_loss_mlp": 1.01975822, + "epoch": 0.47808573318102565, + "flos": 19502475137280.0, + "grad_norm": 1.9386903452876816, + "language_loss": 0.81025714, + "learning_rate": 2.2385748420939013e-06, + "loss": 0.83808964, + "num_input_tokens_seen": 85523425, + "router_z_loss_clip": 3.69335938, + "router_z_loss_mlp": 2.56640625, + "step": 3976, + "time_per_iteration": 3.1052298545837402 + }, + { + "auxiliary_loss_clip": 0.01517894, + "auxiliary_loss_mlp": 0.01283901, + "balance_loss_clip": 1.14672375, + "balance_loss_mlp": 1.02583647, + "epoch": 0.47820597607166476, + "flos": 22603322825280.0, + "grad_norm": 2.079515216972022, + "language_loss": 0.72494769, + "learning_rate": 2.2378014144116583e-06, + "loss": 0.75296569, + "num_input_tokens_seen": 85542235, + "router_z_loss_clip": 3.71679688, + "router_z_loss_mlp": 2.58007812, + "step": 3977, + "time_per_iteration": 3.00146746635437 + }, + { + "auxiliary_loss_clip": 0.01511425, + "auxiliary_loss_mlp": 0.01286492, + "balance_loss_clip": 1.14091027, + "balance_loss_mlp": 1.02709281, + "epoch": 0.4783262189623039, + "flos": 23005251544320.0, + "grad_norm": 2.252154222862582, + "language_loss": 0.80118316, + "learning_rate": 2.23702795065513e-06, + "loss": 0.82916236, + "num_input_tokens_seen": 85561815, + "router_z_loss_clip": 3.70898438, + "router_z_loss_mlp": 2.59375, + "step": 3978, + "time_per_iteration": 3.0142714977264404 + }, + { + "auxiliary_loss_clip": 0.0152129, + "auxiliary_loss_mlp": 0.01195137, + "balance_loss_clip": 1.15599787, + "balance_loss_mlp": 0.98685455, + "epoch": 0.47844646185294293, + "flos": 49777982674560.0, + "grad_norm": 0.9953411181106514, + "language_loss": 0.67424512, + "learning_rate": 2.2362544509416493e-06, + "loss": 0.70140934, + "num_input_tokens_seen": 85613930, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 2.0859375, + "step": 3979, + "time_per_iteration": 3.367388963699341 + }, + { + "auxiliary_loss_clip": 0.015091, + "auxiliary_loss_mlp": 0.012826, + "balance_loss_clip": 1.13685644, + "balance_loss_mlp": 1.02548981, + "epoch": 0.47856670474358204, + "flos": 20231613371520.0, + "grad_norm": 2.4563322952824413, + "language_loss": 0.83009237, + "learning_rate": 2.2354809153885572e-06, + "loss": 0.85800946, + "num_input_tokens_seen": 85631000, + "router_z_loss_clip": 3.72070312, + "router_z_loss_mlp": 2.5703125, + "step": 3980, + "time_per_iteration": 3.136157512664795 + }, + { + "auxiliary_loss_clip": 0.01507679, + "auxiliary_loss_mlp": 0.01284581, + "balance_loss_clip": 1.13649476, + "balance_loss_mlp": 1.02499056, + "epoch": 0.47868694763422115, + "flos": 20992763337120.0, + "grad_norm": 1.7654623691802145, + "language_loss": 0.83125335, + "learning_rate": 2.234707344113197e-06, + "loss": 0.85917598, + "num_input_tokens_seen": 85649095, + "router_z_loss_clip": 3.71289062, + "router_z_loss_mlp": 2.59570312, + "step": 3981, + "time_per_iteration": 3.082878351211548 + }, + { + "auxiliary_loss_clip": 0.01513208, + "auxiliary_loss_mlp": 0.01280897, + "balance_loss_clip": 1.14247632, + "balance_loss_mlp": 1.02645659, + "epoch": 0.4788071905248602, + "flos": 19028595970080.0, + "grad_norm": 1.8011772766264964, + "language_loss": 0.78017348, + "learning_rate": 2.233933737232919e-06, + "loss": 0.80811447, + "num_input_tokens_seen": 85666875, + "router_z_loss_clip": 3.70703125, + "router_z_loss_mlp": 2.54296875, + "step": 3982, + "time_per_iteration": 3.1462693214416504 + }, + { + "auxiliary_loss_clip": 0.01504901, + "auxiliary_loss_mlp": 0.01292301, + "balance_loss_clip": 1.13550782, + "balance_loss_mlp": 1.03499949, + "epoch": 0.4789274334154993, + "flos": 23004417124800.0, + "grad_norm": 2.076827338484813, + "language_loss": 0.78299081, + "learning_rate": 2.2331600948650793e-06, + "loss": 0.81096286, + "num_input_tokens_seen": 85687020, + "router_z_loss_clip": 3.6953125, + "router_z_loss_mlp": 2.57226562, + "step": 3983, + "time_per_iteration": 3.0276947021484375 + }, + { + "auxiliary_loss_clip": 0.01512992, + "auxiliary_loss_mlp": 0.01297916, + "balance_loss_clip": 1.14381218, + "balance_loss_mlp": 1.03775358, + "epoch": 0.4790476763061384, + "flos": 23077808843040.0, + "grad_norm": 1.8166026816945944, + "language_loss": 0.80160749, + "learning_rate": 2.2323864171270386e-06, + "loss": 0.8297165, + "num_input_tokens_seen": 85708290, + "router_z_loss_clip": 3.69140625, + "router_z_loss_mlp": 2.6015625, + "step": 3984, + "time_per_iteration": 3.0616650581359863 + }, + { + "auxiliary_loss_clip": 0.01512477, + "auxiliary_loss_mlp": 0.01298858, + "balance_loss_clip": 1.14301467, + "balance_loss_mlp": 1.03697896, + "epoch": 0.4791679191967775, + "flos": 21181457754720.0, + "grad_norm": 1.8498384203186629, + "language_loss": 0.72473621, + "learning_rate": 2.231612704136164e-06, + "loss": 0.75284952, + "num_input_tokens_seen": 85728660, + "router_z_loss_clip": 3.69726562, + "router_z_loss_mlp": 2.61914062, + "step": 3985, + "time_per_iteration": 3.0505802631378174 + }, + { + "auxiliary_loss_clip": 0.01518537, + "auxiliary_loss_mlp": 0.01305152, + "balance_loss_clip": 1.14864016, + "balance_loss_mlp": 1.04384577, + "epoch": 0.4792881620874166, + "flos": 22303232308800.0, + "grad_norm": 4.3180655893441084, + "language_loss": 0.75334388, + "learning_rate": 2.2308389560098253e-06, + "loss": 0.78158075, + "num_input_tokens_seen": 85745035, + "router_z_loss_clip": 3.69921875, + "router_z_loss_mlp": 2.61328125, + "step": 3986, + "time_per_iteration": 3.1982829570770264 + }, + { + "auxiliary_loss_clip": 0.0151813, + "auxiliary_loss_mlp": 0.01307525, + "balance_loss_clip": 1.15003967, + "balance_loss_mlp": 1.04583669, + "epoch": 0.47940840497805565, + "flos": 17422815430080.0, + "grad_norm": 2.225422178852064, + "language_loss": 0.77035826, + "learning_rate": 2.2300651728654008e-06, + "loss": 0.79861486, + "num_input_tokens_seen": 85760295, + "router_z_loss_clip": 3.68164062, + "router_z_loss_mlp": 2.6171875, + "step": 3987, + "time_per_iteration": 3.202852249145508 + }, + { + "auxiliary_loss_clip": 0.015263, + "auxiliary_loss_mlp": 0.0125724, + "balance_loss_clip": 1.16221571, + "balance_loss_mlp": 1.04438019, + "epoch": 0.47952864786869476, + "flos": 65364563878560.0, + "grad_norm": 0.9677449537607838, + "language_loss": 0.60163915, + "learning_rate": 2.229291354820272e-06, + "loss": 0.62947452, + "num_input_tokens_seen": 85821305, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.1328125, + "step": 3988, + "time_per_iteration": 3.5997891426086426 + }, + { + "auxiliary_loss_clip": 0.01516608, + "auxiliary_loss_mlp": 0.01293901, + "balance_loss_clip": 1.14776254, + "balance_loss_mlp": 1.03297615, + "epoch": 0.47964889075933387, + "flos": 16801659760320.0, + "grad_norm": 2.3678157420003485, + "language_loss": 0.75993401, + "learning_rate": 2.228517501991828e-06, + "loss": 0.78803909, + "num_input_tokens_seen": 85840105, + "router_z_loss_clip": 3.6875, + "router_z_loss_mlp": 2.609375, + "step": 3989, + "time_per_iteration": 3.1712024211883545 + }, + { + "auxiliary_loss_clip": 0.01523121, + "auxiliary_loss_mlp": 0.01214943, + "balance_loss_clip": 1.15939665, + "balance_loss_mlp": 1.0036087, + "epoch": 0.4797691336499729, + "flos": 70086704724000.0, + "grad_norm": 0.8143953355666205, + "language_loss": 0.61060119, + "learning_rate": 2.22774361449746e-06, + "loss": 0.63798183, + "num_input_tokens_seen": 85896585, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.1171875, + "step": 3990, + "time_per_iteration": 3.5183944702148438 + }, + { + "auxiliary_loss_clip": 0.01513597, + "auxiliary_loss_mlp": 0.01281515, + "balance_loss_clip": 1.14509511, + "balance_loss_mlp": 1.02078021, + "epoch": 0.47988937654061203, + "flos": 18955355964480.0, + "grad_norm": 4.159878784766825, + "language_loss": 0.70937139, + "learning_rate": 2.2269696924545668e-06, + "loss": 0.73732245, + "num_input_tokens_seen": 85914415, + "router_z_loss_clip": 3.68359375, + "router_z_loss_mlp": 2.60742188, + "step": 3991, + "time_per_iteration": 3.138087272644043 + }, + { + "auxiliary_loss_clip": 0.01523089, + "auxiliary_loss_mlp": 0.01297062, + "balance_loss_clip": 1.15338802, + "balance_loss_mlp": 1.03918815, + "epoch": 0.48000961943125114, + "flos": 14463555020640.0, + "grad_norm": 3.0693031175557253, + "language_loss": 0.77908087, + "learning_rate": 2.2261957359805523e-06, + "loss": 0.80728239, + "num_input_tokens_seen": 85931650, + "router_z_loss_clip": 3.69921875, + "router_z_loss_mlp": 2.578125, + "step": 3992, + "time_per_iteration": 3.171454668045044 + }, + { + "auxiliary_loss_clip": 0.01513052, + "auxiliary_loss_mlp": 0.01286453, + "balance_loss_clip": 1.14409089, + "balance_loss_mlp": 1.02705348, + "epoch": 0.4801298623218902, + "flos": 27053516213280.0, + "grad_norm": 2.2903695170594154, + "language_loss": 0.74235582, + "learning_rate": 2.225421745192823e-06, + "loss": 0.77035087, + "num_input_tokens_seen": 85951805, + "router_z_loss_clip": 3.68945312, + "router_z_loss_mlp": 2.59375, + "step": 3993, + "time_per_iteration": 3.186426877975464 + }, + { + "auxiliary_loss_clip": 0.01513539, + "auxiliary_loss_mlp": 0.01287169, + "balance_loss_clip": 1.145046, + "balance_loss_mlp": 1.02872348, + "epoch": 0.4802501052125293, + "flos": 26357413770720.0, + "grad_norm": 2.0075858563220907, + "language_loss": 0.78475142, + "learning_rate": 2.2246477202087955e-06, + "loss": 0.81275851, + "num_input_tokens_seen": 85972485, + "router_z_loss_clip": 3.6875, + "router_z_loss_mlp": 2.58398438, + "step": 3994, + "time_per_iteration": 3.133002996444702 + }, + { + "auxiliary_loss_clip": 0.01517158, + "auxiliary_loss_mlp": 0.01285339, + "balance_loss_clip": 1.14837551, + "balance_loss_mlp": 1.03147054, + "epoch": 0.4803703481031684, + "flos": 20995911374400.0, + "grad_norm": 1.5869376626230796, + "language_loss": 0.82845259, + "learning_rate": 2.223873661145887e-06, + "loss": 0.8564775, + "num_input_tokens_seen": 85992540, + "router_z_loss_clip": 3.68945312, + "router_z_loss_mlp": 2.53710938, + "step": 3995, + "time_per_iteration": 4.872479677200317 + }, + { + "auxiliary_loss_clip": 0.01510446, + "auxiliary_loss_mlp": 0.01279275, + "balance_loss_clip": 1.14204729, + "balance_loss_mlp": 1.02292705, + "epoch": 0.4804905909938075, + "flos": 20705795964000.0, + "grad_norm": 2.051412705035601, + "language_loss": 0.71070206, + "learning_rate": 2.2230995681215226e-06, + "loss": 0.7385993, + "num_input_tokens_seen": 86012065, + "router_z_loss_clip": 3.68164062, + "router_z_loss_mlp": 2.5625, + "step": 3996, + "time_per_iteration": 3.1915900707244873 + }, + { + "auxiliary_loss_clip": 0.01515219, + "auxiliary_loss_mlp": 0.01286411, + "balance_loss_clip": 1.14702439, + "balance_loss_mlp": 1.02720189, + "epoch": 0.4806108338844466, + "flos": 16656734803680.0, + "grad_norm": 37.59961658624629, + "language_loss": 0.78083599, + "learning_rate": 2.2223254412531305e-06, + "loss": 0.80885231, + "num_input_tokens_seen": 86029435, + "router_z_loss_clip": 3.6875, + "router_z_loss_mlp": 2.59179688, + "step": 3997, + "time_per_iteration": 3.210669755935669 + }, + { + "auxiliary_loss_clip": 0.0150704, + "auxiliary_loss_mlp": 0.01283304, + "balance_loss_clip": 1.13812852, + "balance_loss_mlp": 1.02924514, + "epoch": 0.4807310767750857, + "flos": 20013410481120.0, + "grad_norm": 1.8244024996613295, + "language_loss": 0.82566231, + "learning_rate": 2.221551280658146e-06, + "loss": 0.85356575, + "num_input_tokens_seen": 86048495, + "router_z_loss_clip": 3.68945312, + "router_z_loss_mlp": 2.5390625, + "step": 3998, + "time_per_iteration": 3.17797589302063 + }, + { + "auxiliary_loss_clip": 0.01515398, + "auxiliary_loss_mlp": 0.01287673, + "balance_loss_clip": 1.14781857, + "balance_loss_mlp": 1.03094435, + "epoch": 0.48085131966572475, + "flos": 23187498174720.0, + "grad_norm": 1.8612784174439942, + "language_loss": 0.74216074, + "learning_rate": 2.2207770864540085e-06, + "loss": 0.77019143, + "num_input_tokens_seen": 86067470, + "router_z_loss_clip": 3.67773438, + "router_z_loss_mlp": 2.56640625, + "step": 3999, + "time_per_iteration": 3.9884607791900635 + }, + { + "auxiliary_loss_clip": 0.01513682, + "auxiliary_loss_mlp": 0.01284886, + "balance_loss_clip": 1.14580882, + "balance_loss_mlp": 1.02872896, + "epoch": 0.48097156255636386, + "flos": 20560757222880.0, + "grad_norm": 3.6045822725487566, + "language_loss": 0.72995329, + "learning_rate": 2.220002858758162e-06, + "loss": 0.75793898, + "num_input_tokens_seen": 86085460, + "router_z_loss_clip": 3.6796875, + "router_z_loss_mlp": 2.56054688, + "step": 4000, + "time_per_iteration": 3.1429953575134277 + }, + { + "auxiliary_loss_clip": 0.01510842, + "auxiliary_loss_mlp": 0.01240791, + "balance_loss_clip": 1.14905369, + "balance_loss_mlp": 1.03403473, + "epoch": 0.481091805447003, + "flos": 70518786694560.0, + "grad_norm": 2.959069232779965, + "language_loss": 0.6084699, + "learning_rate": 2.2192285976880573e-06, + "loss": 0.63598627, + "num_input_tokens_seen": 86149715, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.0703125, + "step": 4001, + "time_per_iteration": 3.5059573650360107 + }, + { + "auxiliary_loss_clip": 0.01509127, + "auxiliary_loss_mlp": 0.01299136, + "balance_loss_clip": 1.14208746, + "balance_loss_mlp": 1.04183483, + "epoch": 0.48121204833764203, + "flos": 36431119834560.0, + "grad_norm": 1.58209483021021, + "language_loss": 0.80542135, + "learning_rate": 2.2184543033611485e-06, + "loss": 0.83350402, + "num_input_tokens_seen": 86170795, + "router_z_loss_clip": 3.66796875, + "router_z_loss_mlp": 2.57226562, + "step": 4002, + "time_per_iteration": 3.9383254051208496 + }, + { + "auxiliary_loss_clip": 0.01503125, + "auxiliary_loss_mlp": 0.0128615, + "balance_loss_clip": 1.13529444, + "balance_loss_mlp": 1.02903974, + "epoch": 0.48133229122828114, + "flos": 27492728677920.0, + "grad_norm": 2.5728725506612253, + "language_loss": 0.8207013, + "learning_rate": 2.2176799758948957e-06, + "loss": 0.84859395, + "num_input_tokens_seen": 86190955, + "router_z_loss_clip": 3.67773438, + "router_z_loss_mlp": 2.5703125, + "step": 4003, + "time_per_iteration": 3.0709590911865234 + }, + { + "auxiliary_loss_clip": 0.01509218, + "auxiliary_loss_mlp": 0.01291964, + "balance_loss_clip": 1.14139271, + "balance_loss_mlp": 1.03313637, + "epoch": 0.4814525341189202, + "flos": 43076313557280.0, + "grad_norm": 2.4951231946871415, + "language_loss": 0.73320639, + "learning_rate": 2.2169056154067635e-06, + "loss": 0.76121819, + "num_input_tokens_seen": 86214875, + "router_z_loss_clip": 3.6796875, + "router_z_loss_mlp": 2.58789062, + "step": 4004, + "time_per_iteration": 3.21045184135437 + }, + { + "auxiliary_loss_clip": 0.01509395, + "auxiliary_loss_mlp": 0.01299266, + "balance_loss_clip": 1.14182615, + "balance_loss_mlp": 1.04101145, + "epoch": 0.4815727770095593, + "flos": 24238839407040.0, + "grad_norm": 1.8017823800768034, + "language_loss": 0.82745177, + "learning_rate": 2.216131222014222e-06, + "loss": 0.85553837, + "num_input_tokens_seen": 86232950, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 2.58203125, + "step": 4005, + "time_per_iteration": 3.1966538429260254 + }, + { + "auxiliary_loss_clip": 0.01516083, + "auxiliary_loss_mlp": 0.01290589, + "balance_loss_clip": 1.1501894, + "balance_loss_mlp": 1.03405046, + "epoch": 0.4816930199001984, + "flos": 18115580194560.0, + "grad_norm": 2.6673496261523706, + "language_loss": 0.80301756, + "learning_rate": 2.2153567958347455e-06, + "loss": 0.83108431, + "num_input_tokens_seen": 86249160, + "router_z_loss_clip": 3.66015625, + "router_z_loss_mlp": 2.56445312, + "step": 4006, + "time_per_iteration": 3.307234764099121 + }, + { + "auxiliary_loss_clip": 0.01512027, + "auxiliary_loss_mlp": 0.01295373, + "balance_loss_clip": 1.14359426, + "balance_loss_mlp": 1.03692746, + "epoch": 0.48181326279083747, + "flos": 17276638844160.0, + "grad_norm": 3.0564265608818904, + "language_loss": 0.80395645, + "learning_rate": 2.214582336985815e-06, + "loss": 0.83203042, + "num_input_tokens_seen": 86267060, + "router_z_loss_clip": 3.68554688, + "router_z_loss_mlp": 2.58398438, + "step": 4007, + "time_per_iteration": 3.145305633544922 + }, + { + "auxiliary_loss_clip": 0.01506733, + "auxiliary_loss_mlp": 0.01289727, + "balance_loss_clip": 1.14008594, + "balance_loss_mlp": 1.03109026, + "epoch": 0.4819335056814766, + "flos": 14905043174880.0, + "grad_norm": 2.2853304421653187, + "language_loss": 0.6619007, + "learning_rate": 2.2138078455849142e-06, + "loss": 0.68986535, + "num_input_tokens_seen": 86285055, + "router_z_loss_clip": 3.66796875, + "router_z_loss_mlp": 2.5859375, + "step": 4008, + "time_per_iteration": 3.169419288635254 + }, + { + "auxiliary_loss_clip": 0.01506208, + "auxiliary_loss_mlp": 0.01289782, + "balance_loss_clip": 1.13865125, + "balance_loss_mlp": 1.03267097, + "epoch": 0.4820537485721157, + "flos": 19246912644960.0, + "grad_norm": 4.626117406171727, + "language_loss": 0.78763878, + "learning_rate": 2.2130333217495334e-06, + "loss": 0.81559861, + "num_input_tokens_seen": 86304225, + "router_z_loss_clip": 3.67773438, + "router_z_loss_mlp": 2.5703125, + "step": 4009, + "time_per_iteration": 3.1764440536499023 + }, + { + "auxiliary_loss_clip": 0.01502116, + "auxiliary_loss_mlp": 0.0130002, + "balance_loss_clip": 1.13458431, + "balance_loss_mlp": 1.03985751, + "epoch": 0.48217399146275475, + "flos": 16035617062080.0, + "grad_norm": 3.9219711726019244, + "language_loss": 0.68629503, + "learning_rate": 2.2122587655971665e-06, + "loss": 0.71431637, + "num_input_tokens_seen": 86319170, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 2.6015625, + "step": 4010, + "time_per_iteration": 3.134382724761963 + }, + { + "auxiliary_loss_clip": 0.01500623, + "auxiliary_loss_mlp": 0.01293685, + "balance_loss_clip": 1.13221884, + "balance_loss_mlp": 1.03466761, + "epoch": 0.48229423435339386, + "flos": 24136508138400.0, + "grad_norm": 2.145127002043556, + "language_loss": 0.63929117, + "learning_rate": 2.211484177245314e-06, + "loss": 0.66723418, + "num_input_tokens_seen": 86338760, + "router_z_loss_clip": 3.68359375, + "router_z_loss_mlp": 2.58984375, + "step": 4011, + "time_per_iteration": 3.1439692974090576 + }, + { + "auxiliary_loss_clip": 0.01506023, + "auxiliary_loss_mlp": 0.0129373, + "balance_loss_clip": 1.14076722, + "balance_loss_mlp": 1.03433084, + "epoch": 0.48241447724403297, + "flos": 23807705640480.0, + "grad_norm": 1.9461349639350145, + "language_loss": 0.72423863, + "learning_rate": 2.21070955681148e-06, + "loss": 0.75223619, + "num_input_tokens_seen": 86357865, + "router_z_loss_clip": 3.65429688, + "router_z_loss_mlp": 2.59375, + "step": 4012, + "time_per_iteration": 3.1316940784454346 + }, + { + "auxiliary_loss_clip": 0.01503215, + "auxiliary_loss_mlp": 0.01284826, + "balance_loss_clip": 1.1368649, + "balance_loss_mlp": 1.02752495, + "epoch": 0.482534720134672, + "flos": 23112437617440.0, + "grad_norm": 1.9844186093505307, + "language_loss": 0.78285116, + "learning_rate": 2.209934904413174e-06, + "loss": 0.81073159, + "num_input_tokens_seen": 86379470, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 2.57226562, + "step": 4013, + "time_per_iteration": 3.0573127269744873 + }, + { + "auxiliary_loss_clip": 0.01507787, + "auxiliary_loss_mlp": 0.01297883, + "balance_loss_clip": 1.14083123, + "balance_loss_mlp": 1.03619504, + "epoch": 0.48265496302531113, + "flos": 20925971118720.0, + "grad_norm": 2.039373398879242, + "language_loss": 0.71650857, + "learning_rate": 2.2091602201679095e-06, + "loss": 0.74456531, + "num_input_tokens_seen": 86399080, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 2.6171875, + "step": 4014, + "time_per_iteration": 3.0959486961364746 + }, + { + "auxiliary_loss_clip": 0.01508703, + "auxiliary_loss_mlp": 0.01284491, + "balance_loss_clip": 1.14293504, + "balance_loss_mlp": 1.02776146, + "epoch": 0.48277520591595025, + "flos": 15232935396960.0, + "grad_norm": 4.274158239449456, + "language_loss": 0.83441478, + "learning_rate": 2.208385504193206e-06, + "loss": 0.86234671, + "num_input_tokens_seen": 86416580, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 2.56640625, + "step": 4015, + "time_per_iteration": 2.992797613143921 + }, + { + "auxiliary_loss_clip": 0.01503365, + "auxiliary_loss_mlp": 0.01297975, + "balance_loss_clip": 1.13569355, + "balance_loss_mlp": 1.04048276, + "epoch": 0.4828954488065893, + "flos": 17860624552800.0, + "grad_norm": 2.073920883078514, + "language_loss": 0.81190288, + "learning_rate": 2.2076107566065873e-06, + "loss": 0.83991635, + "num_input_tokens_seen": 86434365, + "router_z_loss_clip": 3.6796875, + "router_z_loss_mlp": 2.57421875, + "step": 4016, + "time_per_iteration": 3.036285877227783 + }, + { + "auxiliary_loss_clip": 0.01507985, + "auxiliary_loss_mlp": 0.01294365, + "balance_loss_clip": 1.13907361, + "balance_loss_mlp": 1.0370636, + "epoch": 0.4830156916972284, + "flos": 32090122712160.0, + "grad_norm": 12.097521248084039, + "language_loss": 0.75913811, + "learning_rate": 2.2068359775255816e-06, + "loss": 0.78716153, + "num_input_tokens_seen": 86452675, + "router_z_loss_clip": 3.68945312, + "router_z_loss_mlp": 2.57226562, + "step": 4017, + "time_per_iteration": 3.099419355392456 + }, + { + "auxiliary_loss_clip": 0.01505739, + "auxiliary_loss_mlp": 0.01278995, + "balance_loss_clip": 1.13794208, + "balance_loss_mlp": 1.0220753, + "epoch": 0.48313593458786747, + "flos": 21873881165760.0, + "grad_norm": 3.08706900356911, + "language_loss": 0.78740072, + "learning_rate": 2.206061167067723e-06, + "loss": 0.81524807, + "num_input_tokens_seen": 86470785, + "router_z_loss_clip": 3.68164062, + "router_z_loss_mlp": 2.56835938, + "step": 4018, + "time_per_iteration": 2.9456803798675537 + }, + { + "auxiliary_loss_clip": 0.0151142, + "auxiliary_loss_mlp": 0.01285641, + "balance_loss_clip": 1.14384031, + "balance_loss_mlp": 1.02509665, + "epoch": 0.4832561774785066, + "flos": 22603398681600.0, + "grad_norm": 2.2122750835847533, + "language_loss": 0.79646891, + "learning_rate": 2.205286325350549e-06, + "loss": 0.82443953, + "num_input_tokens_seen": 86489850, + "router_z_loss_clip": 3.6796875, + "router_z_loss_mlp": 2.60546875, + "step": 4019, + "time_per_iteration": 3.068233013153076 + }, + { + "auxiliary_loss_clip": 0.0150392, + "auxiliary_loss_mlp": 0.01287895, + "balance_loss_clip": 1.13754642, + "balance_loss_mlp": 1.02887654, + "epoch": 0.4833764203691457, + "flos": 13438688008320.0, + "grad_norm": 2.372806562852761, + "language_loss": 0.7282356, + "learning_rate": 2.204511452491603e-06, + "loss": 0.7561537, + "num_input_tokens_seen": 86506475, + "router_z_loss_clip": 3.66015625, + "router_z_loss_mlp": 2.58984375, + "step": 4020, + "time_per_iteration": 3.129607915878296 + }, + { + "auxiliary_loss_clip": 0.01509763, + "auxiliary_loss_mlp": 0.01283492, + "balance_loss_clip": 1.14328527, + "balance_loss_mlp": 1.02790713, + "epoch": 0.48349666325978474, + "flos": 44131447605600.0, + "grad_norm": 1.9058589481700223, + "language_loss": 0.74994814, + "learning_rate": 2.2037365486084316e-06, + "loss": 0.77788079, + "num_input_tokens_seen": 86529715, + "router_z_loss_clip": 3.66796875, + "router_z_loss_mlp": 2.5546875, + "step": 4021, + "time_per_iteration": 3.2061901092529297 + }, + { + "auxiliary_loss_clip": 0.01504435, + "auxiliary_loss_mlp": 0.01288777, + "balance_loss_clip": 1.13768768, + "balance_loss_mlp": 1.02956843, + "epoch": 0.48361690615042385, + "flos": 26030621465280.0, + "grad_norm": 2.1912201060678127, + "language_loss": 0.77712536, + "learning_rate": 2.2029616138185886e-06, + "loss": 0.80505747, + "num_input_tokens_seen": 86548715, + "router_z_loss_clip": 3.66992188, + "router_z_loss_mlp": 2.59179688, + "step": 4022, + "time_per_iteration": 4.7136735916137695 + }, + { + "auxiliary_loss_clip": 0.01508215, + "auxiliary_loss_mlp": 0.01284403, + "balance_loss_clip": 1.14328265, + "balance_loss_mlp": 1.02595758, + "epoch": 0.48373714904106296, + "flos": 22275506459520.0, + "grad_norm": 1.717587112742894, + "language_loss": 0.82971805, + "learning_rate": 2.202186648239629e-06, + "loss": 0.8576442, + "num_input_tokens_seen": 86568650, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 2.58398438, + "step": 4023, + "time_per_iteration": 3.080834150314331 + }, + { + "auxiliary_loss_clip": 0.01503675, + "auxiliary_loss_mlp": 0.01283645, + "balance_loss_clip": 1.13777637, + "balance_loss_mlp": 1.02558029, + "epoch": 0.483857391931702, + "flos": 28294765564320.0, + "grad_norm": 4.3440968910588, + "language_loss": 0.715114, + "learning_rate": 2.201411651989117e-06, + "loss": 0.74298716, + "num_input_tokens_seen": 86590630, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 2.58007812, + "step": 4024, + "time_per_iteration": 2.9833195209503174 + }, + { + "auxiliary_loss_clip": 0.01503731, + "auxiliary_loss_mlp": 0.01277821, + "balance_loss_clip": 1.13666677, + "balance_loss_mlp": 1.02204561, + "epoch": 0.48397763482234113, + "flos": 27420550660800.0, + "grad_norm": 1.9172161566046324, + "language_loss": 0.78215623, + "learning_rate": 2.2006366251846167e-06, + "loss": 0.80997169, + "num_input_tokens_seen": 86611270, + "router_z_loss_clip": 3.67382812, + "router_z_loss_mlp": 2.55664062, + "step": 4025, + "time_per_iteration": 3.0638811588287354 + }, + { + "auxiliary_loss_clip": 0.01510481, + "auxiliary_loss_mlp": 0.01284861, + "balance_loss_clip": 1.14308333, + "balance_loss_mlp": 1.02775049, + "epoch": 0.48409787771298024, + "flos": 16798966860960.0, + "grad_norm": 1.9952636824856993, + "language_loss": 0.75381541, + "learning_rate": 2.1998615679436997e-06, + "loss": 0.78176892, + "num_input_tokens_seen": 86628810, + "router_z_loss_clip": 3.67382812, + "router_z_loss_mlp": 2.5703125, + "step": 4026, + "time_per_iteration": 3.8688628673553467 + }, + { + "auxiliary_loss_clip": 0.01504263, + "auxiliary_loss_mlp": 0.01299935, + "balance_loss_clip": 1.13716364, + "balance_loss_mlp": 1.03920007, + "epoch": 0.4842181206036193, + "flos": 25085897383680.0, + "grad_norm": 2.612170116029644, + "language_loss": 0.77501225, + "learning_rate": 2.199086480383942e-06, + "loss": 0.80305421, + "num_input_tokens_seen": 86648185, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 2.60742188, + "step": 4027, + "time_per_iteration": 2.9782001972198486 + }, + { + "auxiliary_loss_clip": 0.01512325, + "auxiliary_loss_mlp": 0.01294257, + "balance_loss_clip": 1.14571643, + "balance_loss_mlp": 1.02875447, + "epoch": 0.4843383634942584, + "flos": 30374918337600.0, + "grad_norm": 3.9811254144013892, + "language_loss": 0.6711722, + "learning_rate": 2.1983113626229234e-06, + "loss": 0.69923806, + "num_input_tokens_seen": 86667435, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 2.65625, + "step": 4028, + "time_per_iteration": 3.099151849746704 + }, + { + "auxiliary_loss_clip": 0.01507273, + "auxiliary_loss_mlp": 0.01295624, + "balance_loss_clip": 1.14060295, + "balance_loss_mlp": 1.03679717, + "epoch": 0.4844586063848975, + "flos": 20415718481760.0, + "grad_norm": 6.132793389438592, + "language_loss": 0.78680682, + "learning_rate": 2.1975362147782293e-06, + "loss": 0.81483579, + "num_input_tokens_seen": 86686630, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 2.58789062, + "step": 4029, + "time_per_iteration": 3.9004967212677 + }, + { + "auxiliary_loss_clip": 0.01509558, + "auxiliary_loss_mlp": 0.01222427, + "balance_loss_clip": 1.1483562, + "balance_loss_mlp": 1.00956726, + "epoch": 0.48457884927553657, + "flos": 70311507114240.0, + "grad_norm": 0.7350759813923087, + "language_loss": 0.54153907, + "learning_rate": 2.196761036967448e-06, + "loss": 0.56885886, + "num_input_tokens_seen": 86754595, + "router_z_loss_clip": 3.609375, + "router_z_loss_mlp": 2.1328125, + "step": 4030, + "time_per_iteration": 3.5393896102905273 + }, + { + "auxiliary_loss_clip": 0.01500308, + "auxiliary_loss_mlp": 0.01285007, + "balance_loss_clip": 1.1323607, + "balance_loss_mlp": 1.02770615, + "epoch": 0.4846990921661757, + "flos": 19936377659520.0, + "grad_norm": 1.676658861813676, + "language_loss": 0.77763492, + "learning_rate": 2.1959858293081743e-06, + "loss": 0.80548805, + "num_input_tokens_seen": 86773730, + "router_z_loss_clip": 3.68164062, + "router_z_loss_mlp": 2.57226562, + "step": 4031, + "time_per_iteration": 3.0278966426849365 + }, + { + "auxiliary_loss_clip": 0.01506735, + "auxiliary_loss_mlp": 0.01292663, + "balance_loss_clip": 1.13863993, + "balance_loss_mlp": 1.0342176, + "epoch": 0.4848193350568148, + "flos": 23078188124640.0, + "grad_norm": 2.9117542679266095, + "language_loss": 0.75896722, + "learning_rate": 2.1952105919180056e-06, + "loss": 0.7869612, + "num_input_tokens_seen": 86792985, + "router_z_loss_clip": 3.6796875, + "router_z_loss_mlp": 2.58398438, + "step": 4032, + "time_per_iteration": 3.0223031044006348 + }, + { + "auxiliary_loss_clip": 0.01504096, + "auxiliary_loss_mlp": 0.01283874, + "balance_loss_clip": 1.13604343, + "balance_loss_mlp": 1.02523732, + "epoch": 0.48493957794745385, + "flos": 22457677233600.0, + "grad_norm": 3.020101577278237, + "language_loss": 0.67801011, + "learning_rate": 2.1944353249145456e-06, + "loss": 0.70588982, + "num_input_tokens_seen": 86812095, + "router_z_loss_clip": 3.6796875, + "router_z_loss_mlp": 2.5859375, + "step": 4033, + "time_per_iteration": 2.9935977458953857 + }, + { + "auxiliary_loss_clip": 0.01510607, + "auxiliary_loss_mlp": 0.01290243, + "balance_loss_clip": 1.14546657, + "balance_loss_mlp": 1.03256035, + "epoch": 0.48505982083809296, + "flos": 25048461925440.0, + "grad_norm": 1.7525244971367029, + "language_loss": 0.74895155, + "learning_rate": 2.193660028415401e-06, + "loss": 0.77696002, + "num_input_tokens_seen": 86832875, + "router_z_loss_clip": 3.65820312, + "router_z_loss_mlp": 2.57617188, + "step": 4034, + "time_per_iteration": 2.994605779647827 + }, + { + "auxiliary_loss_clip": 0.01504997, + "auxiliary_loss_mlp": 0.01286362, + "balance_loss_clip": 1.13730097, + "balance_loss_mlp": 1.0277257, + "epoch": 0.485180063728732, + "flos": 26763855940800.0, + "grad_norm": 1.8769914324021395, + "language_loss": 0.82250828, + "learning_rate": 2.1928847025381852e-06, + "loss": 0.85042179, + "num_input_tokens_seen": 86853480, + "router_z_loss_clip": 3.6796875, + "router_z_loss_mlp": 2.5859375, + "step": 4035, + "time_per_iteration": 2.9767634868621826 + }, + { + "auxiliary_loss_clip": 0.01499696, + "auxiliary_loss_mlp": 0.01294628, + "balance_loss_clip": 1.13268876, + "balance_loss_mlp": 1.03236735, + "epoch": 0.4853003066193711, + "flos": 24061333796640.0, + "grad_norm": 1.8958790635411116, + "language_loss": 0.84261686, + "learning_rate": 2.192109347400512e-06, + "loss": 0.87056011, + "num_input_tokens_seen": 86873695, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 2.62304688, + "step": 4036, + "time_per_iteration": 2.98995041847229 + }, + { + "auxiliary_loss_clip": 0.01507962, + "auxiliary_loss_mlp": 0.01282135, + "balance_loss_clip": 1.14183772, + "balance_loss_mlp": 1.02235448, + "epoch": 0.48542054951001024, + "flos": 23078643262560.0, + "grad_norm": 1.9193530546119661, + "language_loss": 0.78775561, + "learning_rate": 2.191333963120004e-06, + "loss": 0.81565654, + "num_input_tokens_seen": 86892675, + "router_z_loss_clip": 3.66015625, + "router_z_loss_mlp": 2.59765625, + "step": 4037, + "time_per_iteration": 2.964259386062622 + }, + { + "auxiliary_loss_clip": 0.0149873, + "auxiliary_loss_mlp": 0.01295444, + "balance_loss_clip": 1.13187933, + "balance_loss_mlp": 1.03394687, + "epoch": 0.4855407924006493, + "flos": 25667190192960.0, + "grad_norm": 3.8749889854914015, + "language_loss": 0.71011102, + "learning_rate": 2.190558549814286e-06, + "loss": 0.73805279, + "num_input_tokens_seen": 86912835, + "router_z_loss_clip": 3.66796875, + "router_z_loss_mlp": 2.61523438, + "step": 4038, + "time_per_iteration": 2.987548351287842 + }, + { + "auxiliary_loss_clip": 0.0150946, + "auxiliary_loss_mlp": 0.01297616, + "balance_loss_clip": 1.14242649, + "balance_loss_mlp": 1.03764427, + "epoch": 0.4856610352912884, + "flos": 23990217768000.0, + "grad_norm": 1.9966539887056463, + "language_loss": 0.79622263, + "learning_rate": 2.1897831076009872e-06, + "loss": 0.82429338, + "num_input_tokens_seen": 86932475, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 2.59960938, + "step": 4039, + "time_per_iteration": 3.0341877937316895 + }, + { + "auxiliary_loss_clip": 0.01503959, + "auxiliary_loss_mlp": 0.01285123, + "balance_loss_clip": 1.13683128, + "balance_loss_mlp": 1.0268681, + "epoch": 0.4857812781819275, + "flos": 24099072680160.0, + "grad_norm": 2.039443113286254, + "language_loss": 0.8006382, + "learning_rate": 2.1890076365977426e-06, + "loss": 0.82852906, + "num_input_tokens_seen": 86952300, + "router_z_loss_clip": 3.67382812, + "router_z_loss_mlp": 2.58203125, + "step": 4040, + "time_per_iteration": 2.9720237255096436 + }, + { + "auxiliary_loss_clip": 0.0151226, + "auxiliary_loss_mlp": 0.01204109, + "balance_loss_clip": 1.15152359, + "balance_loss_mlp": 0.99582672, + "epoch": 0.48590152107256657, + "flos": 56272145006880.0, + "grad_norm": 0.8602647701214472, + "language_loss": 0.52818102, + "learning_rate": 2.188232136922189e-06, + "loss": 0.55534476, + "num_input_tokens_seen": 87010420, + "router_z_loss_clip": 3.609375, + "router_z_loss_mlp": 2.0859375, + "step": 4041, + "time_per_iteration": 3.349543571472168 + }, + { + "auxiliary_loss_clip": 0.0150388, + "auxiliary_loss_mlp": 0.01285047, + "balance_loss_clip": 1.13829744, + "balance_loss_mlp": 1.02602923, + "epoch": 0.4860217639632057, + "flos": 20049025387680.0, + "grad_norm": 2.2331447449629263, + "language_loss": 0.75937814, + "learning_rate": 2.187456608691971e-06, + "loss": 0.78726739, + "num_input_tokens_seen": 87029295, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 2.58984375, + "step": 4042, + "time_per_iteration": 2.983757972717285 + }, + { + "auxiliary_loss_clip": 0.01512237, + "auxiliary_loss_mlp": 0.01279158, + "balance_loss_clip": 1.1471827, + "balance_loss_mlp": 1.02185714, + "epoch": 0.4861420068538448, + "flos": 17824099370400.0, + "grad_norm": 3.873982333482832, + "language_loss": 0.87534344, + "learning_rate": 2.1866810520247334e-06, + "loss": 0.90325743, + "num_input_tokens_seen": 87048165, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 2.57226562, + "step": 4043, + "time_per_iteration": 2.9769582748413086 + }, + { + "auxiliary_loss_clip": 0.015084, + "auxiliary_loss_mlp": 0.01295728, + "balance_loss_clip": 1.14312184, + "balance_loss_mlp": 1.0376637, + "epoch": 0.48626224974448384, + "flos": 26252541315360.0, + "grad_norm": 2.55264454089206, + "language_loss": 0.64646518, + "learning_rate": 2.185905467038129e-06, + "loss": 0.67450649, + "num_input_tokens_seen": 87067070, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 2.58007812, + "step": 4044, + "time_per_iteration": 3.1015446186065674 + }, + { + "auxiliary_loss_clip": 0.01513528, + "auxiliary_loss_mlp": 0.01296048, + "balance_loss_clip": 1.14830256, + "balance_loss_mlp": 1.03989124, + "epoch": 0.48638249263512295, + "flos": 22056696718560.0, + "grad_norm": 1.729553883478974, + "language_loss": 0.77753687, + "learning_rate": 2.1851298538498127e-06, + "loss": 0.80563259, + "num_input_tokens_seen": 87086785, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 2.56054688, + "step": 4045, + "time_per_iteration": 3.030442476272583 + }, + { + "auxiliary_loss_clip": 0.01514734, + "auxiliary_loss_mlp": 0.01289014, + "balance_loss_clip": 1.15022933, + "balance_loss_mlp": 1.02675366, + "epoch": 0.48650273552576206, + "flos": 25122346709760.0, + "grad_norm": 5.343274397214431, + "language_loss": 0.80132973, + "learning_rate": 2.184354212577446e-06, + "loss": 0.82936728, + "num_input_tokens_seen": 87107090, + "router_z_loss_clip": 3.65039062, + "router_z_loss_mlp": 2.62304688, + "step": 4046, + "time_per_iteration": 3.0578057765960693 + }, + { + "auxiliary_loss_clip": 0.01510335, + "auxiliary_loss_mlp": 0.01285379, + "balance_loss_clip": 1.14647651, + "balance_loss_mlp": 1.0244534, + "epoch": 0.4866229784164011, + "flos": 17458847546400.0, + "grad_norm": 4.333000137640583, + "language_loss": 0.63096797, + "learning_rate": 2.1835785433386907e-06, + "loss": 0.65892512, + "num_input_tokens_seen": 87125905, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 2.609375, + "step": 4047, + "time_per_iteration": 3.0624616146087646 + }, + { + "auxiliary_loss_clip": 0.01509646, + "auxiliary_loss_mlp": 0.01286577, + "balance_loss_clip": 1.14460933, + "balance_loss_mlp": 1.02622342, + "epoch": 0.48674322130704023, + "flos": 23333371335360.0, + "grad_norm": 1.8881624423584389, + "language_loss": 0.6544615, + "learning_rate": 2.182802846251216e-06, + "loss": 0.68242371, + "num_input_tokens_seen": 87146175, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 2.60351562, + "step": 4048, + "time_per_iteration": 3.0372743606567383 + }, + { + "auxiliary_loss_clip": 0.01502719, + "auxiliary_loss_mlp": 0.0129733, + "balance_loss_clip": 1.13772869, + "balance_loss_mlp": 1.03888381, + "epoch": 0.4868634641976793, + "flos": 28806914609280.0, + "grad_norm": 1.9605692038080536, + "language_loss": 0.72109842, + "learning_rate": 2.182027121432696e-06, + "loss": 0.7490989, + "num_input_tokens_seen": 87166800, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 2.58398438, + "step": 4049, + "time_per_iteration": 4.807246923446655 + }, + { + "auxiliary_loss_clip": 0.01507316, + "auxiliary_loss_mlp": 0.01293208, + "balance_loss_clip": 1.14123881, + "balance_loss_mlp": 1.0326637, + "epoch": 0.4869837070883184, + "flos": 19027989119520.0, + "grad_norm": 2.125993148822212, + "language_loss": 0.82619774, + "learning_rate": 2.1812513690008054e-06, + "loss": 0.85420299, + "num_input_tokens_seen": 87185920, + "router_z_loss_clip": 3.65820312, + "router_z_loss_mlp": 2.60546875, + "step": 4050, + "time_per_iteration": 3.010024309158325 + }, + { + "auxiliary_loss_clip": 0.01504944, + "auxiliary_loss_mlp": 0.01307264, + "balance_loss_clip": 1.14191926, + "balance_loss_mlp": 1.0461483, + "epoch": 0.4871039499789575, + "flos": 15123170208960.0, + "grad_norm": 2.2137430443697466, + "language_loss": 0.79852682, + "learning_rate": 2.180475589073227e-06, + "loss": 0.82664895, + "num_input_tokens_seen": 87203620, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.61132812, + "step": 4051, + "time_per_iteration": 2.960925340652466 + }, + { + "auxiliary_loss_clip": 0.0150288, + "auxiliary_loss_mlp": 0.01282004, + "balance_loss_clip": 1.13821483, + "balance_loss_mlp": 1.02718222, + "epoch": 0.48722419286959656, + "flos": 26176304985120.0, + "grad_norm": 1.6671715298403609, + "language_loss": 0.73485065, + "learning_rate": 2.1796997817676456e-06, + "loss": 0.76269948, + "num_input_tokens_seen": 87224630, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 2.546875, + "step": 4052, + "time_per_iteration": 3.122870445251465 + }, + { + "auxiliary_loss_clip": 0.01510211, + "auxiliary_loss_mlp": 0.01284415, + "balance_loss_clip": 1.14557517, + "balance_loss_mlp": 1.02711368, + "epoch": 0.4873444357602357, + "flos": 24028525573920.0, + "grad_norm": 3.8953075257541236, + "language_loss": 0.67636967, + "learning_rate": 2.1789239472017494e-06, + "loss": 0.7043159, + "num_input_tokens_seen": 87246280, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 2.57226562, + "step": 4053, + "time_per_iteration": 3.0839383602142334 + }, + { + "auxiliary_loss_clip": 0.0150785, + "auxiliary_loss_mlp": 0.0129231, + "balance_loss_clip": 1.14271331, + "balance_loss_mlp": 1.03291059, + "epoch": 0.4874646786508748, + "flos": 22822663560480.0, + "grad_norm": 2.0296876676624573, + "language_loss": 0.72762316, + "learning_rate": 2.1781480854932326e-06, + "loss": 0.75562477, + "num_input_tokens_seen": 87266045, + "router_z_loss_clip": 3.65039062, + "router_z_loss_mlp": 2.59375, + "step": 4054, + "time_per_iteration": 3.7900142669677734 + }, + { + "auxiliary_loss_clip": 0.01502397, + "auxiliary_loss_mlp": 0.01286035, + "balance_loss_clip": 1.13837957, + "balance_loss_mlp": 1.02835274, + "epoch": 0.48758492154151384, + "flos": 21289705816320.0, + "grad_norm": 2.451382956881872, + "language_loss": 0.79576099, + "learning_rate": 2.1773721967597933e-06, + "loss": 0.82364535, + "num_input_tokens_seen": 87284495, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 2.57617188, + "step": 4055, + "time_per_iteration": 2.982553243637085 + }, + { + "auxiliary_loss_clip": 0.01514549, + "auxiliary_loss_mlp": 0.01204346, + "balance_loss_clip": 1.15587592, + "balance_loss_mlp": 0.99301147, + "epoch": 0.48770516443215295, + "flos": 62249606915040.0, + "grad_norm": 0.8516991716656457, + "language_loss": 0.57407403, + "learning_rate": 2.1765962811191322e-06, + "loss": 0.60126299, + "num_input_tokens_seen": 87338960, + "router_z_loss_clip": 3.59375, + "router_z_loss_mlp": 2.1171875, + "step": 4056, + "time_per_iteration": 4.132954835891724 + }, + { + "auxiliary_loss_clip": 0.01513569, + "auxiliary_loss_mlp": 0.01209518, + "balance_loss_clip": 1.15501547, + "balance_loss_mlp": 0.99971008, + "epoch": 0.48782540732279206, + "flos": 66140392042080.0, + "grad_norm": 0.8254079708040049, + "language_loss": 0.61959928, + "learning_rate": 2.1758203386889566e-06, + "loss": 0.64683014, + "num_input_tokens_seen": 87401730, + "router_z_loss_clip": 3.59375, + "router_z_loss_mlp": 2.1015625, + "step": 4057, + "time_per_iteration": 3.588226556777954 + }, + { + "auxiliary_loss_clip": 0.01514279, + "auxiliary_loss_mlp": 0.01298774, + "balance_loss_clip": 1.1508323, + "balance_loss_mlp": 1.03937423, + "epoch": 0.4879456502134311, + "flos": 14609504037600.0, + "grad_norm": 2.0822679222775218, + "language_loss": 0.84216952, + "learning_rate": 2.1750443695869746e-06, + "loss": 0.87030005, + "num_input_tokens_seen": 87417300, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 2.59375, + "step": 4058, + "time_per_iteration": 2.959521532058716 + }, + { + "auxiliary_loss_clip": 0.01501751, + "auxiliary_loss_mlp": 0.0128571, + "balance_loss_clip": 1.13837492, + "balance_loss_mlp": 1.02783656, + "epoch": 0.4880658931040702, + "flos": 19502513065440.0, + "grad_norm": 2.0932783916141755, + "language_loss": 0.86005133, + "learning_rate": 2.174268373930901e-06, + "loss": 0.88792592, + "num_input_tokens_seen": 87434815, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.578125, + "step": 4059, + "time_per_iteration": 2.988708257675171 + }, + { + "auxiliary_loss_clip": 0.01509132, + "auxiliary_loss_mlp": 0.01287817, + "balance_loss_clip": 1.14343309, + "balance_loss_mlp": 1.03108788, + "epoch": 0.48818613599470934, + "flos": 16724702795040.0, + "grad_norm": 2.1957561778602006, + "language_loss": 0.79848093, + "learning_rate": 2.1734923518384537e-06, + "loss": 0.82645035, + "num_input_tokens_seen": 87451420, + "router_z_loss_clip": 3.66015625, + "router_z_loss_mlp": 2.56640625, + "step": 4060, + "time_per_iteration": 2.9775853157043457 + }, + { + "auxiliary_loss_clip": 0.01512267, + "auxiliary_loss_mlp": 0.01287298, + "balance_loss_clip": 1.14662147, + "balance_loss_mlp": 1.03171313, + "epoch": 0.4883063788853484, + "flos": 26759152848960.0, + "grad_norm": 2.6836776066550967, + "language_loss": 0.81978369, + "learning_rate": 2.1727163034273547e-06, + "loss": 0.84777933, + "num_input_tokens_seen": 87469585, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 2.5546875, + "step": 4061, + "time_per_iteration": 3.094592332839966 + }, + { + "auxiliary_loss_clip": 0.0150506, + "auxiliary_loss_mlp": 0.01300485, + "balance_loss_clip": 1.14086747, + "balance_loss_mlp": 1.04299247, + "epoch": 0.4884266217759875, + "flos": 16765248362400.0, + "grad_norm": 2.2552162661921056, + "language_loss": 0.7899195, + "learning_rate": 2.17194022881533e-06, + "loss": 0.81797493, + "num_input_tokens_seen": 87485675, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 2.57421875, + "step": 4062, + "time_per_iteration": 3.0668182373046875 + }, + { + "auxiliary_loss_clip": 0.01507938, + "auxiliary_loss_mlp": 0.01301343, + "balance_loss_clip": 1.14346874, + "balance_loss_mlp": 1.03908277, + "epoch": 0.4885468646666266, + "flos": 24209444718720.0, + "grad_norm": 2.4315504413899527, + "language_loss": 0.67584687, + "learning_rate": 2.1711641281201092e-06, + "loss": 0.7039398, + "num_input_tokens_seen": 87505605, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.62304688, + "step": 4063, + "time_per_iteration": 2.9687418937683105 + }, + { + "auxiliary_loss_clip": 0.01510889, + "auxiliary_loss_mlp": 0.01301497, + "balance_loss_clip": 1.14699125, + "balance_loss_mlp": 1.04495907, + "epoch": 0.48866710755726567, + "flos": 14612538290400.0, + "grad_norm": 2.402916764806744, + "language_loss": 0.79271913, + "learning_rate": 2.1703880014594264e-06, + "loss": 0.82084298, + "num_input_tokens_seen": 87523195, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 2.56445312, + "step": 4064, + "time_per_iteration": 2.969444751739502 + }, + { + "auxiliary_loss_clip": 0.0150891, + "auxiliary_loss_mlp": 0.01298882, + "balance_loss_clip": 1.14224577, + "balance_loss_mlp": 1.04310703, + "epoch": 0.4887873504479048, + "flos": 28807028393760.0, + "grad_norm": 3.7011679004967433, + "language_loss": 0.73730791, + "learning_rate": 2.1696118489510182e-06, + "loss": 0.76538587, + "num_input_tokens_seen": 87544125, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 2.55664062, + "step": 4065, + "time_per_iteration": 3.0475659370422363 + }, + { + "auxiliary_loss_clip": 0.01511467, + "auxiliary_loss_mlp": 0.01307144, + "balance_loss_clip": 1.14575076, + "balance_loss_mlp": 1.05098689, + "epoch": 0.48890759333854383, + "flos": 22786631444160.0, + "grad_norm": 1.854273580609521, + "language_loss": 0.72807693, + "learning_rate": 2.1688356707126286e-06, + "loss": 0.75626302, + "num_input_tokens_seen": 87563745, + "router_z_loss_clip": 3.65820312, + "router_z_loss_mlp": 2.56054688, + "step": 4066, + "time_per_iteration": 3.051711082458496 + }, + { + "auxiliary_loss_clip": 0.0150886, + "auxiliary_loss_mlp": 0.01298513, + "balance_loss_clip": 1.14253235, + "balance_loss_mlp": 1.03930402, + "epoch": 0.48902783622918294, + "flos": 17788067254080.0, + "grad_norm": 2.2346183349140967, + "language_loss": 0.70099735, + "learning_rate": 2.168059466862001e-06, + "loss": 0.72907102, + "num_input_tokens_seen": 87581895, + "router_z_loss_clip": 3.66796875, + "router_z_loss_mlp": 2.59179688, + "step": 4067, + "time_per_iteration": 3.072077751159668 + }, + { + "auxiliary_loss_clip": 0.01499389, + "auxiliary_loss_mlp": 0.01293897, + "balance_loss_clip": 1.13411212, + "balance_loss_mlp": 1.04022002, + "epoch": 0.48914807911982205, + "flos": 22312524708000.0, + "grad_norm": 4.619811966130618, + "language_loss": 0.81405437, + "learning_rate": 2.167283237516887e-06, + "loss": 0.84198725, + "num_input_tokens_seen": 87600170, + "router_z_loss_clip": 3.65429688, + "router_z_loss_mlp": 2.53515625, + "step": 4068, + "time_per_iteration": 3.1591544151306152 + }, + { + "auxiliary_loss_clip": 0.01503962, + "auxiliary_loss_mlp": 0.01300382, + "balance_loss_clip": 1.13830376, + "balance_loss_mlp": 1.04308093, + "epoch": 0.4892683220104611, + "flos": 16364912626080.0, + "grad_norm": 1.914436487456857, + "language_loss": 0.74817765, + "learning_rate": 2.1665069827950383e-06, + "loss": 0.7762211, + "num_input_tokens_seen": 87617455, + "router_z_loss_clip": 3.66015625, + "router_z_loss_mlp": 2.57226562, + "step": 4069, + "time_per_iteration": 3.1397669315338135 + }, + { + "auxiliary_loss_clip": 0.01515791, + "auxiliary_loss_mlp": 0.01284065, + "balance_loss_clip": 1.15097499, + "balance_loss_mlp": 1.02848053, + "epoch": 0.4893885649011002, + "flos": 15740798559840.0, + "grad_norm": 2.0754367132384375, + "language_loss": 0.86807954, + "learning_rate": 2.1657307028142126e-06, + "loss": 0.89607811, + "num_input_tokens_seen": 87634995, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 2.5546875, + "step": 4070, + "time_per_iteration": 3.080887794494629 + }, + { + "auxiliary_loss_clip": 0.01508497, + "auxiliary_loss_mlp": 0.01288187, + "balance_loss_clip": 1.14239371, + "balance_loss_mlp": 1.03050423, + "epoch": 0.48950880779173933, + "flos": 28584349980480.0, + "grad_norm": 1.9141646500267213, + "language_loss": 0.67156917, + "learning_rate": 2.164954397692171e-06, + "loss": 0.69953597, + "num_input_tokens_seen": 87654420, + "router_z_loss_clip": 3.65820312, + "router_z_loss_mlp": 2.57617188, + "step": 4071, + "time_per_iteration": 3.109809160232544 + }, + { + "auxiliary_loss_clip": 0.01527423, + "auxiliary_loss_mlp": 0.01255547, + "balance_loss_clip": 1.16723049, + "balance_loss_mlp": 1.04497528, + "epoch": 0.4896290506823784, + "flos": 66192126780960.0, + "grad_norm": 1.108007602675727, + "language_loss": 0.77249157, + "learning_rate": 2.164178067546678e-06, + "loss": 0.80032134, + "num_input_tokens_seen": 87713585, + "router_z_loss_clip": 3.6015625, + "router_z_loss_mlp": 2.109375, + "step": 4072, + "time_per_iteration": 3.578307867050171 + }, + { + "auxiliary_loss_clip": 0.01498453, + "auxiliary_loss_mlp": 0.01288116, + "balance_loss_clip": 1.1317122, + "balance_loss_mlp": 1.03253138, + "epoch": 0.4897492935730175, + "flos": 12532688942400.0, + "grad_norm": 2.3251850027500462, + "language_loss": 0.91304314, + "learning_rate": 2.163401712495504e-06, + "loss": 0.94090879, + "num_input_tokens_seen": 87731280, + "router_z_loss_clip": 3.66992188, + "router_z_loss_mlp": 2.5546875, + "step": 4073, + "time_per_iteration": 3.0485241413116455 + }, + { + "auxiliary_loss_clip": 0.01504219, + "auxiliary_loss_mlp": 0.01288712, + "balance_loss_clip": 1.13738608, + "balance_loss_mlp": 1.03331816, + "epoch": 0.4898695364636566, + "flos": 23479054855200.0, + "grad_norm": 1.9732576064905698, + "language_loss": 0.79696965, + "learning_rate": 2.1626253326564194e-06, + "loss": 0.82489896, + "num_input_tokens_seen": 87750230, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 2.55273438, + "step": 4074, + "time_per_iteration": 3.0276010036468506 + }, + { + "auxiliary_loss_clip": 0.01507685, + "auxiliary_loss_mlp": 0.01281852, + "balance_loss_clip": 1.14078867, + "balance_loss_mlp": 1.02455103, + "epoch": 0.48998977935429566, + "flos": 27162560766240.0, + "grad_norm": 2.104790558359641, + "language_loss": 0.76870948, + "learning_rate": 2.161848928147201e-06, + "loss": 0.79660487, + "num_input_tokens_seen": 87770500, + "router_z_loss_clip": 3.6640625, + "router_z_loss_mlp": 2.57226562, + "step": 4075, + "time_per_iteration": 3.0883073806762695 + }, + { + "auxiliary_loss_clip": 0.01509278, + "auxiliary_loss_mlp": 0.01276525, + "balance_loss_clip": 1.14353371, + "balance_loss_mlp": 1.01979601, + "epoch": 0.4901100222449348, + "flos": 20341681984800.0, + "grad_norm": 2.000931476480853, + "language_loss": 0.80872977, + "learning_rate": 2.161072499085629e-06, + "loss": 0.83658779, + "num_input_tokens_seen": 87789495, + "router_z_loss_clip": 3.65820312, + "router_z_loss_mlp": 2.56640625, + "step": 4076, + "time_per_iteration": 3.8698928356170654 + }, + { + "auxiliary_loss_clip": 0.01512949, + "auxiliary_loss_mlp": 0.01281237, + "balance_loss_clip": 1.14643049, + "balance_loss_mlp": 1.02565265, + "epoch": 0.4902302651355739, + "flos": 30449030690880.0, + "grad_norm": 2.2725664060480995, + "language_loss": 0.82944262, + "learning_rate": 2.160296045589487e-06, + "loss": 0.85738444, + "num_input_tokens_seen": 87812955, + "router_z_loss_clip": 3.6640625, + "router_z_loss_mlp": 2.5546875, + "step": 4077, + "time_per_iteration": 4.088701248168945 + }, + { + "auxiliary_loss_clip": 0.01505978, + "auxiliary_loss_mlp": 0.01288983, + "balance_loss_clip": 1.13898993, + "balance_loss_mlp": 1.03320777, + "epoch": 0.49035050802621294, + "flos": 19176327610560.0, + "grad_norm": 1.998289280583991, + "language_loss": 0.69778401, + "learning_rate": 2.159519567776562e-06, + "loss": 0.72573364, + "num_input_tokens_seen": 87832605, + "router_z_loss_clip": 3.66992188, + "router_z_loss_mlp": 2.55664062, + "step": 4078, + "time_per_iteration": 3.0366480350494385 + }, + { + "auxiliary_loss_clip": 0.01510668, + "auxiliary_loss_mlp": 0.01282193, + "balance_loss_clip": 1.14573252, + "balance_loss_mlp": 1.02527356, + "epoch": 0.49047075091685205, + "flos": 22230219872160.0, + "grad_norm": 3.914165062318449, + "language_loss": 0.71015573, + "learning_rate": 2.1587430657646463e-06, + "loss": 0.73808438, + "num_input_tokens_seen": 87846040, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 2.56835938, + "step": 4079, + "time_per_iteration": 3.0403292179107666 + }, + { + "auxiliary_loss_clip": 0.01514158, + "auxiliary_loss_mlp": 0.01280618, + "balance_loss_clip": 1.15104294, + "balance_loss_mlp": 1.02293551, + "epoch": 0.4905909938074911, + "flos": 20158563006720.0, + "grad_norm": 1.7906571651558232, + "language_loss": 0.77867782, + "learning_rate": 2.157966539671533e-06, + "loss": 0.8066256, + "num_input_tokens_seen": 87865680, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.57617188, + "step": 4080, + "time_per_iteration": 3.8879096508026123 + }, + { + "auxiliary_loss_clip": 0.01512975, + "auxiliary_loss_mlp": 0.01288398, + "balance_loss_clip": 1.14880157, + "balance_loss_mlp": 1.0307157, + "epoch": 0.4907112366981302, + "flos": 17204157401760.0, + "grad_norm": 1.96021937679252, + "language_loss": 0.67153955, + "learning_rate": 2.157189989615021e-06, + "loss": 0.69955325, + "num_input_tokens_seen": 87884270, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 2.57617188, + "step": 4081, + "time_per_iteration": 3.025932788848877 + }, + { + "auxiliary_loss_clip": 0.01516075, + "auxiliary_loss_mlp": 0.01298085, + "balance_loss_clip": 1.15144861, + "balance_loss_mlp": 1.03735054, + "epoch": 0.4908314795887693, + "flos": 21691103541120.0, + "grad_norm": 1.9215124319470986, + "language_loss": 0.75097489, + "learning_rate": 2.156413415712913e-06, + "loss": 0.77911651, + "num_input_tokens_seen": 87906320, + "router_z_loss_clip": 3.65039062, + "router_z_loss_mlp": 2.60742188, + "step": 4082, + "time_per_iteration": 3.0350916385650635 + }, + { + "auxiliary_loss_clip": 0.01510662, + "auxiliary_loss_mlp": 0.0128762, + "balance_loss_clip": 1.14618325, + "balance_loss_mlp": 1.02993751, + "epoch": 0.4909517224794084, + "flos": 26216091989280.0, + "grad_norm": 1.9238941177151105, + "language_loss": 0.78815627, + "learning_rate": 2.155636818083014e-06, + "loss": 0.81613904, + "num_input_tokens_seen": 87927690, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 2.57617188, + "step": 4083, + "time_per_iteration": 3.062420606613159 + }, + { + "auxiliary_loss_clip": 0.01512025, + "auxiliary_loss_mlp": 0.01293158, + "balance_loss_clip": 1.14663172, + "balance_loss_mlp": 1.03738213, + "epoch": 0.4910719653700475, + "flos": 23150593710720.0, + "grad_norm": 2.098125077356463, + "language_loss": 0.84421408, + "learning_rate": 2.154860196843134e-06, + "loss": 0.87226593, + "num_input_tokens_seen": 87946885, + "router_z_loss_clip": 3.65429688, + "router_z_loss_mlp": 2.55664062, + "step": 4084, + "time_per_iteration": 3.946181297302246 + }, + { + "auxiliary_loss_clip": 0.01515485, + "auxiliary_loss_mlp": 0.01294485, + "balance_loss_clip": 1.15086889, + "balance_loss_mlp": 1.0350852, + "epoch": 0.4911922082606866, + "flos": 23334129898560.0, + "grad_norm": 1.8435708470821743, + "language_loss": 0.76753235, + "learning_rate": 2.154083552111085e-06, + "loss": 0.79563206, + "num_input_tokens_seen": 87966055, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 2.59375, + "step": 4085, + "time_per_iteration": 3.027324914932251 + }, + { + "auxiliary_loss_clip": 0.01514784, + "auxiliary_loss_mlp": 0.01289437, + "balance_loss_clip": 1.14895296, + "balance_loss_mlp": 1.03251696, + "epoch": 0.49131245115132566, + "flos": 29205695291040.0, + "grad_norm": 3.2837531149164794, + "language_loss": 0.81914771, + "learning_rate": 2.1533068840046834e-06, + "loss": 0.8471899, + "num_input_tokens_seen": 87986320, + "router_z_loss_clip": 3.66210938, + "router_z_loss_mlp": 2.56835938, + "step": 4086, + "time_per_iteration": 3.179535388946533 + }, + { + "auxiliary_loss_clip": 0.01517159, + "auxiliary_loss_mlp": 0.01299405, + "balance_loss_clip": 1.15274251, + "balance_loss_mlp": 1.0409596, + "epoch": 0.49143269404196477, + "flos": 20149005110400.0, + "grad_norm": 2.888639503608082, + "language_loss": 0.61600327, + "learning_rate": 2.152530192641749e-06, + "loss": 0.64416885, + "num_input_tokens_seen": 88001230, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.58398438, + "step": 4087, + "time_per_iteration": 3.238496780395508 + }, + { + "auxiliary_loss_clip": 0.01512741, + "auxiliary_loss_mlp": 0.01279918, + "balance_loss_clip": 1.14611828, + "balance_loss_mlp": 1.02318835, + "epoch": 0.4915529369326039, + "flos": 24392146487040.0, + "grad_norm": 5.1643523500220025, + "language_loss": 0.72338367, + "learning_rate": 2.1517534781401068e-06, + "loss": 0.75131023, + "num_input_tokens_seen": 88019110, + "router_z_loss_clip": 3.66796875, + "router_z_loss_mlp": 2.56640625, + "step": 4088, + "time_per_iteration": 3.221778154373169 + }, + { + "auxiliary_loss_clip": 0.01510654, + "auxiliary_loss_mlp": 0.01281093, + "balance_loss_clip": 1.1472913, + "balance_loss_mlp": 1.02169383, + "epoch": 0.49167317982324293, + "flos": 10525814102880.0, + "grad_norm": 2.6220904568652843, + "language_loss": 0.69194525, + "learning_rate": 2.150976740617581e-06, + "loss": 0.7198627, + "num_input_tokens_seen": 88035670, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 2.59375, + "step": 4089, + "time_per_iteration": 3.189934015274048 + }, + { + "auxiliary_loss_clip": 0.01518445, + "auxiliary_loss_mlp": 0.01290228, + "balance_loss_clip": 1.1522181, + "balance_loss_mlp": 1.03235483, + "epoch": 0.49179342271388204, + "flos": 25595808667200.0, + "grad_norm": 3.754628997529752, + "language_loss": 0.71445692, + "learning_rate": 2.150199980192006e-06, + "loss": 0.7425437, + "num_input_tokens_seen": 88054790, + "router_z_loss_clip": 3.66015625, + "router_z_loss_mlp": 2.578125, + "step": 4090, + "time_per_iteration": 3.1342110633850098 + }, + { + "auxiliary_loss_clip": 0.0151495, + "auxiliary_loss_mlp": 0.01288188, + "balance_loss_clip": 1.14986944, + "balance_loss_mlp": 1.03203082, + "epoch": 0.49191366560452116, + "flos": 21103969795200.0, + "grad_norm": 1.6925545272164084, + "language_loss": 0.81123102, + "learning_rate": 2.1494231969812114e-06, + "loss": 0.83926237, + "num_input_tokens_seen": 88073780, + "router_z_loss_clip": 3.65039062, + "router_z_loss_mlp": 2.56054688, + "step": 4091, + "time_per_iteration": 2.976454496383667 + }, + { + "auxiliary_loss_clip": 0.01519129, + "auxiliary_loss_mlp": 0.01283749, + "balance_loss_clip": 1.15496707, + "balance_loss_mlp": 1.02759194, + "epoch": 0.4920339084951602, + "flos": 26069915403360.0, + "grad_norm": 2.2929160639865804, + "language_loss": 0.81241512, + "learning_rate": 2.1486463911030372e-06, + "loss": 0.84044385, + "num_input_tokens_seen": 88094430, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 2.56054688, + "step": 4092, + "time_per_iteration": 3.0706787109375 + }, + { + "auxiliary_loss_clip": 0.01514496, + "auxiliary_loss_mlp": 0.01292158, + "balance_loss_clip": 1.1490674, + "balance_loss_mlp": 1.03447533, + "epoch": 0.4921541513857993, + "flos": 25083545837760.0, + "grad_norm": 5.9209541857699435, + "language_loss": 0.74784076, + "learning_rate": 2.147869562675324e-06, + "loss": 0.7759074, + "num_input_tokens_seen": 88113400, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 2.57617188, + "step": 4093, + "time_per_iteration": 3.128812789916992 + }, + { + "auxiliary_loss_clip": 0.01517026, + "auxiliary_loss_mlp": 0.01294195, + "balance_loss_clip": 1.15316439, + "balance_loss_mlp": 1.03727496, + "epoch": 0.49227439427643843, + "flos": 24392146487040.0, + "grad_norm": 2.902995820914112, + "language_loss": 0.72346604, + "learning_rate": 2.147092711815915e-06, + "loss": 0.75157833, + "num_input_tokens_seen": 88132750, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.56835938, + "step": 4094, + "time_per_iteration": 3.0018203258514404 + }, + { + "auxiliary_loss_clip": 0.01521119, + "auxiliary_loss_mlp": 0.01298285, + "balance_loss_clip": 1.15629447, + "balance_loss_mlp": 1.03812218, + "epoch": 0.4923946371670775, + "flos": 11365324375680.0, + "grad_norm": 2.7640263531923464, + "language_loss": 0.85692596, + "learning_rate": 2.1463158386426593e-06, + "loss": 0.88511997, + "num_input_tokens_seen": 88150560, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 2.6015625, + "step": 4095, + "time_per_iteration": 2.9662907123565674 + }, + { + "auxiliary_loss_clip": 0.01514955, + "auxiliary_loss_mlp": 0.01291294, + "balance_loss_clip": 1.14949834, + "balance_loss_mlp": 1.03113151, + "epoch": 0.4925148800577166, + "flos": 30448613481120.0, + "grad_norm": 2.073195080583639, + "language_loss": 0.77305067, + "learning_rate": 2.145538943273407e-06, + "loss": 0.80111313, + "num_input_tokens_seen": 88170835, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 2.6015625, + "step": 4096, + "time_per_iteration": 3.1212708950042725 + }, + { + "auxiliary_loss_clip": 0.01522034, + "auxiliary_loss_mlp": 0.01283532, + "balance_loss_clip": 1.157619, + "balance_loss_mlp": 1.02584887, + "epoch": 0.49263512294835565, + "flos": 20852579400480.0, + "grad_norm": 1.7818955611289902, + "language_loss": 0.71997678, + "learning_rate": 2.144762025826013e-06, + "loss": 0.74803245, + "num_input_tokens_seen": 88189925, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 2.57617188, + "step": 4097, + "time_per_iteration": 2.9924511909484863 + }, + { + "auxiliary_loss_clip": 0.01521746, + "auxiliary_loss_mlp": 0.01279541, + "balance_loss_clip": 1.15623498, + "balance_loss_mlp": 1.02166796, + "epoch": 0.49275536583899476, + "flos": 23769663331680.0, + "grad_norm": 2.7336978385084905, + "language_loss": 0.87211198, + "learning_rate": 2.143985086418334e-06, + "loss": 0.90012485, + "num_input_tokens_seen": 88205105, + "router_z_loss_clip": 3.65820312, + "router_z_loss_mlp": 2.578125, + "step": 4098, + "time_per_iteration": 3.0487499237060547 + }, + { + "auxiliary_loss_clip": 0.01519077, + "auxiliary_loss_mlp": 0.01283394, + "balance_loss_clip": 1.1540544, + "balance_loss_mlp": 1.03162384, + "epoch": 0.4928756087296339, + "flos": 22275734028480.0, + "grad_norm": 1.5399480602254167, + "language_loss": 0.76648593, + "learning_rate": 2.1432081251682324e-06, + "loss": 0.79451066, + "num_input_tokens_seen": 88225475, + "router_z_loss_clip": 3.65039062, + "router_z_loss_mlp": 2.515625, + "step": 4099, + "time_per_iteration": 3.066931962966919 + }, + { + "auxiliary_loss_clip": 0.01526157, + "auxiliary_loss_mlp": 0.01287663, + "balance_loss_clip": 1.16088748, + "balance_loss_mlp": 1.03131533, + "epoch": 0.49299585162027293, + "flos": 19647475950240.0, + "grad_norm": 1.8947170152473076, + "language_loss": 0.87110895, + "learning_rate": 2.142431142193572e-06, + "loss": 0.89924711, + "num_input_tokens_seen": 88243255, + "router_z_loss_clip": 3.65429688, + "router_z_loss_mlp": 2.5625, + "step": 4100, + "time_per_iteration": 3.2069344520568848 + }, + { + "auxiliary_loss_clip": 0.01525258, + "auxiliary_loss_mlp": 0.01286266, + "balance_loss_clip": 1.16013622, + "balance_loss_mlp": 1.03010941, + "epoch": 0.49311609451091204, + "flos": 38840226818400.0, + "grad_norm": 1.9468230011967993, + "language_loss": 0.71499288, + "learning_rate": 2.1416541376122207e-06, + "loss": 0.74310815, + "num_input_tokens_seen": 88263435, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 2.56054688, + "step": 4101, + "time_per_iteration": 3.1567609310150146 + }, + { + "auxiliary_loss_clip": 0.01521337, + "auxiliary_loss_mlp": 0.01294913, + "balance_loss_clip": 1.15506911, + "balance_loss_mlp": 1.03436935, + "epoch": 0.49323633740155115, + "flos": 28331594172000.0, + "grad_norm": 1.9957806612076, + "language_loss": 0.7366848, + "learning_rate": 2.1408771115420496e-06, + "loss": 0.76484728, + "num_input_tokens_seen": 88283295, + "router_z_loss_clip": 3.66015625, + "router_z_loss_mlp": 2.60546875, + "step": 4102, + "time_per_iteration": 3.1012091636657715 + }, + { + "auxiliary_loss_clip": 0.01531738, + "auxiliary_loss_mlp": 0.01279408, + "balance_loss_clip": 1.16875327, + "balance_loss_mlp": 1.02191591, + "epoch": 0.4933565802921902, + "flos": 21137346940320.0, + "grad_norm": 2.6928375331731513, + "language_loss": 0.6470446, + "learning_rate": 2.140100064100932e-06, + "loss": 0.67515606, + "num_input_tokens_seen": 88299270, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.57421875, + "step": 4103, + "time_per_iteration": 3.1099061965942383 + }, + { + "auxiliary_loss_clip": 0.01523039, + "auxiliary_loss_mlp": 0.01288026, + "balance_loss_clip": 1.15782058, + "balance_loss_mlp": 1.03072512, + "epoch": 0.4934768231828293, + "flos": 18040860990720.0, + "grad_norm": 2.1798895859096494, + "language_loss": 0.75788295, + "learning_rate": 2.139322995406746e-06, + "loss": 0.78599358, + "num_input_tokens_seen": 88316905, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 2.57226562, + "step": 4104, + "time_per_iteration": 4.782192945480347 + }, + { + "auxiliary_loss_clip": 0.0152607, + "auxiliary_loss_mlp": 0.01283826, + "balance_loss_clip": 1.16142559, + "balance_loss_mlp": 1.02805078, + "epoch": 0.4935970660734684, + "flos": 23471810576640.0, + "grad_norm": 1.8778667112400087, + "language_loss": 0.79690278, + "learning_rate": 2.1385459055773727e-06, + "loss": 0.82500172, + "num_input_tokens_seen": 88335095, + "router_z_loss_clip": 3.65039062, + "router_z_loss_mlp": 2.55664062, + "step": 4105, + "time_per_iteration": 3.0392494201660156 + }, + { + "auxiliary_loss_clip": 0.01532593, + "auxiliary_loss_mlp": 0.01286848, + "balance_loss_clip": 1.16704965, + "balance_loss_mlp": 1.03164506, + "epoch": 0.4937173089641075, + "flos": 64483370557920.0, + "grad_norm": 2.943291905602903, + "language_loss": 0.7384187, + "learning_rate": 2.137768794730696e-06, + "loss": 0.76661313, + "num_input_tokens_seen": 88358545, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 2.55078125, + "step": 4106, + "time_per_iteration": 3.4340593814849854 + }, + { + "auxiliary_loss_clip": 0.0153042, + "auxiliary_loss_mlp": 0.01303994, + "balance_loss_clip": 1.16669762, + "balance_loss_mlp": 1.04669261, + "epoch": 0.4938375518547466, + "flos": 22348215470880.0, + "grad_norm": 1.9777170015201233, + "language_loss": 0.80266321, + "learning_rate": 2.1369916629846026e-06, + "loss": 0.83100736, + "num_input_tokens_seen": 88378295, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 2.57226562, + "step": 4107, + "time_per_iteration": 3.1262567043304443 + }, + { + "auxiliary_loss_clip": 0.01525736, + "auxiliary_loss_mlp": 0.01286393, + "balance_loss_clip": 1.15953231, + "balance_loss_mlp": 1.03099918, + "epoch": 0.4939577947453857, + "flos": 17860662480960.0, + "grad_norm": 2.4064289202814293, + "language_loss": 0.74953848, + "learning_rate": 2.136214510456983e-06, + "loss": 0.77765983, + "num_input_tokens_seen": 88396750, + "router_z_loss_clip": 3.6640625, + "router_z_loss_mlp": 2.55273438, + "step": 4108, + "time_per_iteration": 3.883382797241211 + }, + { + "auxiliary_loss_clip": 0.01545227, + "auxiliary_loss_mlp": 0.01209641, + "balance_loss_clip": 1.18441284, + "balance_loss_mlp": 1.00212097, + "epoch": 0.49407803763602476, + "flos": 70073505724320.0, + "grad_norm": 0.8864622066686214, + "language_loss": 0.62979072, + "learning_rate": 2.1354373372657296e-06, + "loss": 0.65733945, + "num_input_tokens_seen": 88455190, + "router_z_loss_clip": 3.609375, + "router_z_loss_mlp": 2.078125, + "step": 4109, + "time_per_iteration": 3.5920612812042236 + }, + { + "auxiliary_loss_clip": 0.01532608, + "auxiliary_loss_mlp": 0.01287973, + "balance_loss_clip": 1.16795492, + "balance_loss_mlp": 1.03029013, + "epoch": 0.49419828052666387, + "flos": 24319854685440.0, + "grad_norm": 1.751433041098073, + "language_loss": 0.71017617, + "learning_rate": 2.1346601435287404e-06, + "loss": 0.73838198, + "num_input_tokens_seen": 88477460, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 2.57617188, + "step": 4110, + "time_per_iteration": 3.0660154819488525 + }, + { + "auxiliary_loss_clip": 0.01525372, + "auxiliary_loss_mlp": 0.01281607, + "balance_loss_clip": 1.16067994, + "balance_loss_mlp": 1.02773929, + "epoch": 0.494318523417303, + "flos": 29388548772000.0, + "grad_norm": 2.017264380535473, + "language_loss": 0.80614787, + "learning_rate": 2.1338829293639144e-06, + "loss": 0.83421767, + "num_input_tokens_seen": 88497820, + "router_z_loss_clip": 3.65039062, + "router_z_loss_mlp": 2.53710938, + "step": 4111, + "time_per_iteration": 4.007000684738159 + }, + { + "auxiliary_loss_clip": 0.01526432, + "auxiliary_loss_mlp": 0.01296878, + "balance_loss_clip": 1.16211295, + "balance_loss_mlp": 1.03938568, + "epoch": 0.49443876630794203, + "flos": 15270105358080.0, + "grad_norm": 2.4776982093428472, + "language_loss": 0.82812452, + "learning_rate": 2.1331056948891547e-06, + "loss": 0.85635763, + "num_input_tokens_seen": 88514920, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 2.57421875, + "step": 4112, + "time_per_iteration": 2.9937212467193604 + }, + { + "auxiliary_loss_clip": 0.01527063, + "auxiliary_loss_mlp": 0.01280316, + "balance_loss_clip": 1.16412807, + "balance_loss_mlp": 1.02053523, + "epoch": 0.49455900919858115, + "flos": 12349001041920.0, + "grad_norm": 2.3140084667067, + "language_loss": 0.76274824, + "learning_rate": 2.1323284402223666e-06, + "loss": 0.79082203, + "num_input_tokens_seen": 88530910, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.59765625, + "step": 4113, + "time_per_iteration": 3.001410961151123 + }, + { + "auxiliary_loss_clip": 0.01535542, + "auxiliary_loss_mlp": 0.01289237, + "balance_loss_clip": 1.17050374, + "balance_loss_mlp": 1.03536916, + "epoch": 0.4946792520892202, + "flos": 22781586998880.0, + "grad_norm": 1.9294847881906763, + "language_loss": 0.88183701, + "learning_rate": 2.1315511654814597e-06, + "loss": 0.91008484, + "num_input_tokens_seen": 88549320, + "router_z_loss_clip": 3.65039062, + "router_z_loss_mlp": 2.53710938, + "step": 4114, + "time_per_iteration": 2.966782331466675 + }, + { + "auxiliary_loss_clip": 0.01525589, + "auxiliary_loss_mlp": 0.01282888, + "balance_loss_clip": 1.16128206, + "balance_loss_mlp": 1.0271126, + "epoch": 0.4947994949798593, + "flos": 23150328213600.0, + "grad_norm": 2.0162046249771537, + "language_loss": 0.78620434, + "learning_rate": 2.1307738707843456e-06, + "loss": 0.81428909, + "num_input_tokens_seen": 88568985, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.55664062, + "step": 4115, + "time_per_iteration": 3.0824592113494873 + }, + { + "auxiliary_loss_clip": 0.01533273, + "auxiliary_loss_mlp": 0.01289101, + "balance_loss_clip": 1.16952431, + "balance_loss_mlp": 1.02951121, + "epoch": 0.4949197378704984, + "flos": 23662287617760.0, + "grad_norm": 2.417267794858328, + "language_loss": 0.687078, + "learning_rate": 2.1299965562489385e-06, + "loss": 0.71530175, + "num_input_tokens_seen": 88588790, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.59570312, + "step": 4116, + "time_per_iteration": 2.9689016342163086 + }, + { + "auxiliary_loss_clip": 0.01525778, + "auxiliary_loss_mlp": 0.01279084, + "balance_loss_clip": 1.16338873, + "balance_loss_mlp": 1.02216387, + "epoch": 0.4950399807611375, + "flos": 26914204624320.0, + "grad_norm": 1.7417699644287683, + "language_loss": 0.78828567, + "learning_rate": 2.129219221993158e-06, + "loss": 0.81633425, + "num_input_tokens_seen": 88613575, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.56835938, + "step": 4117, + "time_per_iteration": 3.0536322593688965 + }, + { + "auxiliary_loss_clip": 0.01544386, + "auxiliary_loss_mlp": 0.01209618, + "balance_loss_clip": 1.18451858, + "balance_loss_mlp": 1.00514984, + "epoch": 0.4951602236517766, + "flos": 67321031829120.0, + "grad_norm": 0.7841069843580499, + "language_loss": 0.59863985, + "learning_rate": 2.128441868134924e-06, + "loss": 0.62617981, + "num_input_tokens_seen": 88675510, + "router_z_loss_clip": 3.59375, + "router_z_loss_mlp": 2.046875, + "step": 4118, + "time_per_iteration": 3.5175082683563232 + }, + { + "auxiliary_loss_clip": 0.01525906, + "auxiliary_loss_mlp": 0.01290171, + "balance_loss_clip": 1.16064334, + "balance_loss_mlp": 1.03191566, + "epoch": 0.4952804665424157, + "flos": 19903493580480.0, + "grad_norm": 2.286009977757184, + "language_loss": 0.82184911, + "learning_rate": 2.1276644947921606e-06, + "loss": 0.85000992, + "num_input_tokens_seen": 88694425, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 2.58203125, + "step": 4119, + "time_per_iteration": 3.038395881652832 + }, + { + "auxiliary_loss_clip": 0.01520699, + "auxiliary_loss_mlp": 0.01290512, + "balance_loss_clip": 1.15565884, + "balance_loss_mlp": 1.03187537, + "epoch": 0.49540070943305475, + "flos": 18808800096960.0, + "grad_norm": 2.811333548098911, + "language_loss": 0.8223924, + "learning_rate": 2.126887102082795e-06, + "loss": 0.85050446, + "num_input_tokens_seen": 88714450, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 2.5859375, + "step": 4120, + "time_per_iteration": 3.1208977699279785 + }, + { + "auxiliary_loss_clip": 0.01523029, + "auxiliary_loss_mlp": 0.01282905, + "balance_loss_clip": 1.15751076, + "balance_loss_mlp": 1.025985, + "epoch": 0.49552095232369386, + "flos": 24936383119680.0, + "grad_norm": 1.7659070079181498, + "language_loss": 0.70578694, + "learning_rate": 2.126109690124757e-06, + "loss": 0.73384631, + "num_input_tokens_seen": 88735265, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 2.56835938, + "step": 4121, + "time_per_iteration": 3.1233270168304443 + }, + { + "auxiliary_loss_clip": 0.01521367, + "auxiliary_loss_mlp": 0.01292964, + "balance_loss_clip": 1.15437198, + "balance_loss_mlp": 1.03566265, + "epoch": 0.495641195214333, + "flos": 22859492168160.0, + "grad_norm": 2.9380628002093405, + "language_loss": 0.70802039, + "learning_rate": 2.1253322590359786e-06, + "loss": 0.73616374, + "num_input_tokens_seen": 88754600, + "router_z_loss_clip": 3.66992188, + "router_z_loss_mlp": 2.57226562, + "step": 4122, + "time_per_iteration": 2.9844470024108887 + }, + { + "auxiliary_loss_clip": 0.01527555, + "auxiliary_loss_mlp": 0.01298784, + "balance_loss_clip": 1.16320705, + "balance_loss_mlp": 1.04033828, + "epoch": 0.49576143810497203, + "flos": 25771797151200.0, + "grad_norm": 5.434248916543181, + "language_loss": 0.74037075, + "learning_rate": 2.124554808934397e-06, + "loss": 0.76863408, + "num_input_tokens_seen": 88775180, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 2.58398438, + "step": 4123, + "time_per_iteration": 3.1282200813293457 + }, + { + "auxiliary_loss_clip": 0.01525778, + "auxiliary_loss_mlp": 0.01310984, + "balance_loss_clip": 1.16155577, + "balance_loss_mlp": 1.05215645, + "epoch": 0.49588168099561114, + "flos": 22131112497120.0, + "grad_norm": 2.060151814887012, + "language_loss": 0.7336002, + "learning_rate": 2.1237773399379496e-06, + "loss": 0.76196784, + "num_input_tokens_seen": 88796145, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 2.58789062, + "step": 4124, + "time_per_iteration": 3.0347626209259033 + }, + { + "auxiliary_loss_clip": 0.01515772, + "auxiliary_loss_mlp": 0.01300267, + "balance_loss_clip": 1.15066361, + "balance_loss_mlp": 1.04372859, + "epoch": 0.49600192388625025, + "flos": 24389529444000.0, + "grad_norm": 1.730782432192269, + "language_loss": 0.86913085, + "learning_rate": 2.122999852164578e-06, + "loss": 0.8972913, + "num_input_tokens_seen": 88816765, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 2.56445312, + "step": 4125, + "time_per_iteration": 3.064021110534668 + }, + { + "auxiliary_loss_clip": 0.01522269, + "auxiliary_loss_mlp": 0.01292612, + "balance_loss_clip": 1.15788937, + "balance_loss_mlp": 1.03225851, + "epoch": 0.4961221667768893, + "flos": 22859681808960.0, + "grad_norm": 4.323282257778854, + "language_loss": 0.58564103, + "learning_rate": 2.122222345732227e-06, + "loss": 0.6137898, + "num_input_tokens_seen": 88836680, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 2.60351562, + "step": 4126, + "time_per_iteration": 3.0270767211914062 + }, + { + "auxiliary_loss_clip": 0.01526688, + "auxiliary_loss_mlp": 0.01301717, + "balance_loss_clip": 1.16227996, + "balance_loss_mlp": 1.04117322, + "epoch": 0.4962424096675284, + "flos": 17860017702240.0, + "grad_norm": 2.0114960528367045, + "language_loss": 0.83151001, + "learning_rate": 2.121444820758843e-06, + "loss": 0.85979408, + "num_input_tokens_seen": 88855320, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 2.60546875, + "step": 4127, + "time_per_iteration": 3.012481212615967 + }, + { + "auxiliary_loss_clip": 0.01526696, + "auxiliary_loss_mlp": 0.01288649, + "balance_loss_clip": 1.16198301, + "balance_loss_mlp": 1.02867734, + "epoch": 0.49636265255816747, + "flos": 21795482930400.0, + "grad_norm": 2.838831654755803, + "language_loss": 0.78797948, + "learning_rate": 2.120667277362376e-06, + "loss": 0.81613296, + "num_input_tokens_seen": 88874035, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 2.59960938, + "step": 4128, + "time_per_iteration": 3.191532850265503 + }, + { + "auxiliary_loss_clip": 0.01526628, + "auxiliary_loss_mlp": 0.01291206, + "balance_loss_clip": 1.16433907, + "balance_loss_mlp": 1.03180623, + "epoch": 0.4964828954488066, + "flos": 16360247462400.0, + "grad_norm": 2.716480699751786, + "language_loss": 0.84986532, + "learning_rate": 2.1198897156607796e-06, + "loss": 0.87804371, + "num_input_tokens_seen": 88891390, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.59375, + "step": 4129, + "time_per_iteration": 3.1654012203216553 + }, + { + "auxiliary_loss_clip": 0.01527689, + "auxiliary_loss_mlp": 0.01288891, + "balance_loss_clip": 1.16558623, + "balance_loss_mlp": 1.02930069, + "epoch": 0.4966031383394457, + "flos": 24713059927680.0, + "grad_norm": 2.7679522546967403, + "language_loss": 0.73593688, + "learning_rate": 2.1191121357720085e-06, + "loss": 0.7641027, + "num_input_tokens_seen": 88909450, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.59570312, + "step": 4130, + "time_per_iteration": 2.9835870265960693 + }, + { + "auxiliary_loss_clip": 0.01527743, + "auxiliary_loss_mlp": 0.01286414, + "balance_loss_clip": 1.16517162, + "balance_loss_mlp": 1.02796865, + "epoch": 0.49672338123008475, + "flos": 22932883886400.0, + "grad_norm": 2.3533507387036736, + "language_loss": 0.74614298, + "learning_rate": 2.1183345378140206e-06, + "loss": 0.7742846, + "num_input_tokens_seen": 88929195, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.58398438, + "step": 4131, + "time_per_iteration": 3.0344090461730957 + }, + { + "auxiliary_loss_clip": 0.01550804, + "auxiliary_loss_mlp": 0.01225746, + "balance_loss_clip": 1.19266772, + "balance_loss_mlp": 1.01593781, + "epoch": 0.49684362412072386, + "flos": 65983064577120.0, + "grad_norm": 0.8699846435711843, + "language_loss": 0.61960655, + "learning_rate": 2.1175569219047783e-06, + "loss": 0.64737207, + "num_input_tokens_seen": 88990635, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 2.1015625, + "step": 4132, + "time_per_iteration": 5.304989814758301 + }, + { + "auxiliary_loss_clip": 0.01523067, + "auxiliary_loss_mlp": 0.01295286, + "balance_loss_clip": 1.16025078, + "balance_loss_mlp": 1.03817558, + "epoch": 0.49696386701136297, + "flos": 19973168339040.0, + "grad_norm": 1.8382800320068489, + "language_loss": 0.7348187, + "learning_rate": 2.1167792881622437e-06, + "loss": 0.76300222, + "num_input_tokens_seen": 89009655, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.5703125, + "step": 4133, + "time_per_iteration": 2.973283290863037 + }, + { + "auxiliary_loss_clip": 0.01525578, + "auxiliary_loss_mlp": 0.01284839, + "balance_loss_clip": 1.16278243, + "balance_loss_mlp": 1.02715611, + "epoch": 0.497084109902002, + "flos": 24752960716320.0, + "grad_norm": 2.1519712045237975, + "language_loss": 0.81050527, + "learning_rate": 2.116001636704384e-06, + "loss": 0.83860946, + "num_input_tokens_seen": 89030040, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.57617188, + "step": 4134, + "time_per_iteration": 3.192436456680298 + }, + { + "auxiliary_loss_clip": 0.01528568, + "auxiliary_loss_mlp": 0.01283619, + "balance_loss_clip": 1.16764688, + "balance_loss_mlp": 1.02574503, + "epoch": 0.49720435279264114, + "flos": 21873994950240.0, + "grad_norm": 3.6666124068762214, + "language_loss": 0.80253315, + "learning_rate": 2.1152239676491685e-06, + "loss": 0.83065498, + "num_input_tokens_seen": 89048145, + "router_z_loss_clip": 3.609375, + "router_z_loss_mlp": 2.578125, + "step": 4135, + "time_per_iteration": 2.993166208267212 + }, + { + "auxiliary_loss_clip": 0.01525568, + "auxiliary_loss_mlp": 0.0129111, + "balance_loss_clip": 1.16368699, + "balance_loss_mlp": 1.03647876, + "epoch": 0.49732459568328025, + "flos": 23808084922080.0, + "grad_norm": 1.7218399033537743, + "language_loss": 0.73295283, + "learning_rate": 2.114446281114569e-06, + "loss": 0.7611196, + "num_input_tokens_seen": 89067165, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 2.54492188, + "step": 4136, + "time_per_iteration": 3.7667291164398193 + }, + { + "auxiliary_loss_clip": 0.01527405, + "auxiliary_loss_mlp": 0.01290985, + "balance_loss_clip": 1.16549468, + "balance_loss_mlp": 1.03673518, + "epoch": 0.4974448385739193, + "flos": 20049821879040.0, + "grad_norm": 2.307299829244249, + "language_loss": 0.75981748, + "learning_rate": 2.1136685772185587e-06, + "loss": 0.7880013, + "num_input_tokens_seen": 89086190, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.54101562, + "step": 4137, + "time_per_iteration": 2.984257459640503 + }, + { + "auxiliary_loss_clip": 0.01521658, + "auxiliary_loss_mlp": 0.01284266, + "balance_loss_clip": 1.16055131, + "balance_loss_mlp": 1.02601135, + "epoch": 0.4975650814645584, + "flos": 24823356109920.0, + "grad_norm": 1.99132549878944, + "language_loss": 0.7757535, + "learning_rate": 2.1128908560791163e-06, + "loss": 0.80381274, + "num_input_tokens_seen": 89106020, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 2.58203125, + "step": 4138, + "time_per_iteration": 3.902585744857788 + }, + { + "auxiliary_loss_clip": 0.01530468, + "auxiliary_loss_mlp": 0.01285116, + "balance_loss_clip": 1.16904473, + "balance_loss_mlp": 1.02991259, + "epoch": 0.4976853243551975, + "flos": 19831581060480.0, + "grad_norm": 2.8358564629158205, + "language_loss": 0.78217769, + "learning_rate": 2.1121131178142203e-06, + "loss": 0.81033355, + "num_input_tokens_seen": 89125385, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 2.55078125, + "step": 4139, + "time_per_iteration": 3.0415079593658447 + }, + { + "auxiliary_loss_clip": 0.01518157, + "auxiliary_loss_mlp": 0.01290136, + "balance_loss_clip": 1.15567613, + "balance_loss_mlp": 1.03531456, + "epoch": 0.4978055672458366, + "flos": 23144904486720.0, + "grad_norm": 1.934359956465799, + "language_loss": 0.82552141, + "learning_rate": 2.1113353625418544e-06, + "loss": 0.85360432, + "num_input_tokens_seen": 89143935, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.546875, + "step": 4140, + "time_per_iteration": 3.0428102016448975 + }, + { + "auxiliary_loss_clip": 0.01531169, + "auxiliary_loss_mlp": 0.01281371, + "balance_loss_clip": 1.17051911, + "balance_loss_mlp": 1.02864718, + "epoch": 0.4979258101364757, + "flos": 15561737894880.0, + "grad_norm": 1.7322652246136987, + "language_loss": 0.79006267, + "learning_rate": 2.1105575903800017e-06, + "loss": 0.81818807, + "num_input_tokens_seen": 89162655, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 2.52539062, + "step": 4141, + "time_per_iteration": 3.0006415843963623 + }, + { + "auxiliary_loss_clip": 0.01521034, + "auxiliary_loss_mlp": 0.01286745, + "balance_loss_clip": 1.15920305, + "balance_loss_mlp": 1.02887106, + "epoch": 0.4980460530271148, + "flos": 26358399902880.0, + "grad_norm": 2.1226521280965254, + "language_loss": 0.85161471, + "learning_rate": 2.1097798014466502e-06, + "loss": 0.87969255, + "num_input_tokens_seen": 89182255, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.578125, + "step": 4142, + "time_per_iteration": 3.0219106674194336 + }, + { + "auxiliary_loss_clip": 0.01525367, + "auxiliary_loss_mlp": 0.01288719, + "balance_loss_clip": 1.16380262, + "balance_loss_mlp": 1.03275299, + "epoch": 0.49816629591775385, + "flos": 17276449203360.0, + "grad_norm": 5.185088676927434, + "language_loss": 0.59355038, + "learning_rate": 2.109001995859791e-06, + "loss": 0.62169123, + "num_input_tokens_seen": 89201155, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 2.55859375, + "step": 4143, + "time_per_iteration": 3.0776638984680176 + }, + { + "auxiliary_loss_clip": 0.01549547, + "auxiliary_loss_mlp": 0.01215851, + "balance_loss_clip": 1.19232202, + "balance_loss_mlp": 1.00985718, + "epoch": 0.49828653880839296, + "flos": 64937905634880.0, + "grad_norm": 0.8071301354672249, + "language_loss": 0.60017335, + "learning_rate": 2.108224173737415e-06, + "loss": 0.62782735, + "num_input_tokens_seen": 89264455, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.0625, + "step": 4144, + "time_per_iteration": 3.5546796321868896 + }, + { + "auxiliary_loss_clip": 0.01524332, + "auxiliary_loss_mlp": 0.0128403, + "balance_loss_clip": 1.16235602, + "balance_loss_mlp": 1.02787328, + "epoch": 0.498406781699032, + "flos": 27486963597600.0, + "grad_norm": 2.3554016879009287, + "language_loss": 0.76388162, + "learning_rate": 2.1074463351975183e-06, + "loss": 0.79196525, + "num_input_tokens_seen": 89283340, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 2.56054688, + "step": 4145, + "time_per_iteration": 3.0517985820770264 + }, + { + "auxiliary_loss_clip": 0.01521039, + "auxiliary_loss_mlp": 0.01285122, + "balance_loss_clip": 1.16009593, + "balance_loss_mlp": 1.03068161, + "epoch": 0.49852702458967113, + "flos": 31502344187520.0, + "grad_norm": 2.0727185476454473, + "language_loss": 0.7135098, + "learning_rate": 2.106668480358098e-06, + "loss": 0.74157143, + "num_input_tokens_seen": 89303565, + "router_z_loss_clip": 3.609375, + "router_z_loss_mlp": 2.54296875, + "step": 4146, + "time_per_iteration": 3.1046743392944336 + }, + { + "auxiliary_loss_clip": 0.01523936, + "auxiliary_loss_mlp": 0.01285197, + "balance_loss_clip": 1.16074121, + "balance_loss_mlp": 1.02694213, + "epoch": 0.49864726748031024, + "flos": 22854864932640.0, + "grad_norm": 1.9383484428053879, + "language_loss": 0.71040285, + "learning_rate": 2.105890609337154e-06, + "loss": 0.73849416, + "num_input_tokens_seen": 89322080, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.58203125, + "step": 4147, + "time_per_iteration": 2.966978073120117 + }, + { + "auxiliary_loss_clip": 0.01548406, + "auxiliary_loss_mlp": 0.01233871, + "balance_loss_clip": 1.19077063, + "balance_loss_mlp": 1.02482605, + "epoch": 0.4987675103709493, + "flos": 70413079819680.0, + "grad_norm": 0.7080902357295515, + "language_loss": 0.6379661, + "learning_rate": 2.1051127222526883e-06, + "loss": 0.66578889, + "num_input_tokens_seen": 89394195, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 2.09375, + "step": 4148, + "time_per_iteration": 3.4951510429382324 + }, + { + "auxiliary_loss_clip": 0.01524938, + "auxiliary_loss_mlp": 0.01279949, + "balance_loss_clip": 1.16518927, + "balance_loss_mlp": 1.02493668, + "epoch": 0.4988877532615884, + "flos": 28770123929760.0, + "grad_norm": 1.601468613979523, + "language_loss": 0.80695355, + "learning_rate": 2.1043348192227067e-06, + "loss": 0.83500236, + "num_input_tokens_seen": 89414565, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 2.54882812, + "step": 4149, + "time_per_iteration": 3.025871515274048 + }, + { + "auxiliary_loss_clip": 0.01523805, + "auxiliary_loss_mlp": 0.0128333, + "balance_loss_clip": 1.16238022, + "balance_loss_mlp": 1.02965283, + "epoch": 0.4990079961522275, + "flos": 16874558412480.0, + "grad_norm": 1.9489301033230308, + "language_loss": 0.62587249, + "learning_rate": 2.1035569003652156e-06, + "loss": 0.65394384, + "num_input_tokens_seen": 89433195, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 2.53515625, + "step": 4150, + "time_per_iteration": 2.9625396728515625 + }, + { + "auxiliary_loss_clip": 0.01525761, + "auxiliary_loss_mlp": 0.01288424, + "balance_loss_clip": 1.16486573, + "balance_loss_mlp": 1.0274986, + "epoch": 0.4991282390428666, + "flos": 13292359709760.0, + "grad_norm": 2.078511806631276, + "language_loss": 0.81966513, + "learning_rate": 2.1027789657982255e-06, + "loss": 0.84780699, + "num_input_tokens_seen": 89447410, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 2.609375, + "step": 4151, + "time_per_iteration": 3.001173734664917 + }, + { + "auxiliary_loss_clip": 0.0152135, + "auxiliary_loss_mlp": 0.01284259, + "balance_loss_clip": 1.15935636, + "balance_loss_mlp": 1.02752995, + "epoch": 0.4992484819335057, + "flos": 21539313587520.0, + "grad_norm": 2.384738096462556, + "language_loss": 0.77448553, + "learning_rate": 2.1020010156397482e-06, + "loss": 0.80254161, + "num_input_tokens_seen": 89464630, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.56640625, + "step": 4152, + "time_per_iteration": 3.019416570663452 + }, + { + "auxiliary_loss_clip": 0.01524035, + "auxiliary_loss_mlp": 0.01281927, + "balance_loss_clip": 1.16135108, + "balance_loss_mlp": 1.02576959, + "epoch": 0.4993687248241448, + "flos": 24862232838240.0, + "grad_norm": 1.74155417150704, + "language_loss": 0.77516359, + "learning_rate": 2.101223050007797e-06, + "loss": 0.80322313, + "num_input_tokens_seen": 89483180, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.56054688, + "step": 4153, + "time_per_iteration": 3.1060595512390137 + }, + { + "auxiliary_loss_clip": 0.01543347, + "auxiliary_loss_mlp": 0.01202446, + "balance_loss_clip": 1.1861999, + "balance_loss_mlp": 0.99568939, + "epoch": 0.49948896771478385, + "flos": 62947378196640.0, + "grad_norm": 0.8302988388030846, + "language_loss": 0.53793335, + "learning_rate": 2.1004450690203904e-06, + "loss": 0.5653913, + "num_input_tokens_seen": 89539260, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.0703125, + "step": 4154, + "time_per_iteration": 3.5978827476501465 + }, + { + "auxiliary_loss_clip": 0.01540808, + "auxiliary_loss_mlp": 0.01207283, + "balance_loss_clip": 1.18396425, + "balance_loss_mlp": 1.00128937, + "epoch": 0.49960921060542296, + "flos": 68291167777920.0, + "grad_norm": 0.859074517218668, + "language_loss": 0.63311899, + "learning_rate": 2.099667072795546e-06, + "loss": 0.66059989, + "num_input_tokens_seen": 89601380, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.0625, + "step": 4155, + "time_per_iteration": 3.4998779296875 + }, + { + "auxiliary_loss_clip": 0.01517643, + "auxiliary_loss_mlp": 0.01274237, + "balance_loss_clip": 1.15603256, + "balance_loss_mlp": 1.01750755, + "epoch": 0.49972945349606207, + "flos": 23661604910880.0, + "grad_norm": 2.854741879851854, + "language_loss": 0.79706919, + "learning_rate": 2.0988890614512864e-06, + "loss": 0.82498801, + "num_input_tokens_seen": 89621270, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 2.56640625, + "step": 4156, + "time_per_iteration": 3.143038749694824 + }, + { + "auxiliary_loss_clip": 0.01523882, + "auxiliary_loss_mlp": 0.0127533, + "balance_loss_clip": 1.16249251, + "balance_loss_mlp": 1.01841021, + "epoch": 0.4998496963867011, + "flos": 19757885916960.0, + "grad_norm": 3.5897246479816296, + "language_loss": 0.84607965, + "learning_rate": 2.098111035105635e-06, + "loss": 0.87407178, + "num_input_tokens_seen": 89639695, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 2.56835938, + "step": 4157, + "time_per_iteration": 3.07340669631958 + }, + { + "auxiliary_loss_clip": 0.01519848, + "auxiliary_loss_mlp": 0.01288991, + "balance_loss_clip": 1.15718675, + "balance_loss_mlp": 1.02901959, + "epoch": 0.49996993927734024, + "flos": 22267503617760.0, + "grad_norm": 2.1857308003196687, + "language_loss": 0.73442215, + "learning_rate": 2.0973329938766176e-06, + "loss": 0.76251054, + "num_input_tokens_seen": 89657125, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.59960938, + "step": 4158, + "time_per_iteration": 3.0328783988952637 + }, + { + "auxiliary_loss_clip": 0.01526309, + "auxiliary_loss_mlp": 0.01295695, + "balance_loss_clip": 1.16568637, + "balance_loss_mlp": 1.03610468, + "epoch": 0.5000901821679793, + "flos": 23329237165920.0, + "grad_norm": 2.404061475267022, + "language_loss": 0.78981185, + "learning_rate": 2.0965549378822618e-06, + "loss": 0.81803191, + "num_input_tokens_seen": 89678415, + "router_z_loss_clip": 3.60742188, + "router_z_loss_mlp": 2.59570312, + "step": 4159, + "time_per_iteration": 4.691409349441528 + }, + { + "auxiliary_loss_clip": 0.01518497, + "auxiliary_loss_mlp": 0.01297425, + "balance_loss_clip": 1.15648007, + "balance_loss_mlp": 1.04279363, + "epoch": 0.5002104250586185, + "flos": 20341075134240.0, + "grad_norm": 2.616062563803282, + "language_loss": 0.84483242, + "learning_rate": 2.095776867240599e-06, + "loss": 0.87299168, + "num_input_tokens_seen": 89695405, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.54492188, + "step": 4160, + "time_per_iteration": 3.0146870613098145 + }, + { + "auxiliary_loss_clip": 0.01519174, + "auxiliary_loss_mlp": 0.01274775, + "balance_loss_clip": 1.15729678, + "balance_loss_mlp": 1.02014399, + "epoch": 0.5003306679492575, + "flos": 13993127316000.0, + "grad_norm": 2.238906392902473, + "language_loss": 0.82353032, + "learning_rate": 2.094998782069661e-06, + "loss": 0.85146981, + "num_input_tokens_seen": 89713110, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 2.54492188, + "step": 4161, + "time_per_iteration": 3.024630546569824 + }, + { + "auxiliary_loss_clip": 0.0151947, + "auxiliary_loss_mlp": 0.01285386, + "balance_loss_clip": 1.15812671, + "balance_loss_mlp": 1.0292294, + "epoch": 0.5004509108398966, + "flos": 27675240805440.0, + "grad_norm": 1.9628318712035715, + "language_loss": 0.75573552, + "learning_rate": 2.0942206824874845e-06, + "loss": 0.78378415, + "num_input_tokens_seen": 89735885, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 2.56054688, + "step": 4162, + "time_per_iteration": 3.0510036945343018 + }, + { + "auxiliary_loss_clip": 0.01511109, + "auxiliary_loss_mlp": 0.01277205, + "balance_loss_clip": 1.14852846, + "balance_loss_mlp": 1.01971316, + "epoch": 0.5005711537305357, + "flos": 14977979755200.0, + "grad_norm": 2.1945990176321737, + "language_loss": 0.78828132, + "learning_rate": 2.093442568612105e-06, + "loss": 0.81616449, + "num_input_tokens_seen": 89753690, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.57421875, + "step": 4163, + "time_per_iteration": 3.907076835632324 + }, + { + "auxiliary_loss_clip": 0.01514822, + "auxiliary_loss_mlp": 0.0128389, + "balance_loss_clip": 1.15276635, + "balance_loss_mlp": 1.02601624, + "epoch": 0.5006913966211748, + "flos": 26505562620960.0, + "grad_norm": 1.6469036856081527, + "language_loss": 0.85511744, + "learning_rate": 2.0926644405615613e-06, + "loss": 0.88310456, + "num_input_tokens_seen": 89774590, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.578125, + "step": 4164, + "time_per_iteration": 3.115511894226074 + }, + { + "auxiliary_loss_clip": 0.01512231, + "auxiliary_loss_mlp": 0.01281724, + "balance_loss_clip": 1.15015578, + "balance_loss_mlp": 1.0267117, + "epoch": 0.5008116395118138, + "flos": 20451409244640.0, + "grad_norm": 2.95729902810775, + "language_loss": 0.81172562, + "learning_rate": 2.091886298453897e-06, + "loss": 0.83966517, + "num_input_tokens_seen": 89792775, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.54882812, + "step": 4165, + "time_per_iteration": 3.9837231636047363 + }, + { + "auxiliary_loss_clip": 0.01518316, + "auxiliary_loss_mlp": 0.01285865, + "balance_loss_clip": 1.15711188, + "balance_loss_mlp": 1.02951765, + "epoch": 0.500931882402453, + "flos": 21582021060000.0, + "grad_norm": 2.260110251670363, + "language_loss": 0.72703356, + "learning_rate": 2.091108142407153e-06, + "loss": 0.7550754, + "num_input_tokens_seen": 89811515, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 2.5625, + "step": 4166, + "time_per_iteration": 3.144526720046997 + }, + { + "auxiliary_loss_clip": 0.01529082, + "auxiliary_loss_mlp": 0.01249397, + "balance_loss_clip": 1.17118609, + "balance_loss_mlp": 1.04187775, + "epoch": 0.5010521252930921, + "flos": 57791865458880.0, + "grad_norm": 0.858859589932259, + "language_loss": 0.62375057, + "learning_rate": 2.090329972539377e-06, + "loss": 0.65153527, + "num_input_tokens_seen": 89870080, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 2.078125, + "step": 4167, + "time_per_iteration": 3.5342209339141846 + }, + { + "auxiliary_loss_clip": 0.01512077, + "auxiliary_loss_mlp": 0.01285959, + "balance_loss_clip": 1.15048623, + "balance_loss_mlp": 1.0313282, + "epoch": 0.5011723681837311, + "flos": 18627539598720.0, + "grad_norm": 2.164515592508442, + "language_loss": 0.68385845, + "learning_rate": 2.089551788968616e-06, + "loss": 0.71183884, + "num_input_tokens_seen": 89888045, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.54492188, + "step": 4168, + "time_per_iteration": 3.1497297286987305 + }, + { + "auxiliary_loss_clip": 0.0152613, + "auxiliary_loss_mlp": 0.01230568, + "balance_loss_clip": 1.16790152, + "balance_loss_mlp": 1.02152252, + "epoch": 0.5012926110743702, + "flos": 55889635505760.0, + "grad_norm": 0.8422412631596397, + "language_loss": 0.60762656, + "learning_rate": 2.08877359181292e-06, + "loss": 0.63519359, + "num_input_tokens_seen": 89944610, + "router_z_loss_clip": 3.5859375, + "router_z_loss_mlp": 2.09375, + "step": 4169, + "time_per_iteration": 3.3957293033599854 + }, + { + "auxiliary_loss_clip": 0.01506417, + "auxiliary_loss_mlp": 0.0128515, + "balance_loss_clip": 1.14298081, + "balance_loss_mlp": 1.02956581, + "epoch": 0.5014128539650093, + "flos": 24240280677120.0, + "grad_norm": 3.4506658301203985, + "language_loss": 0.85791898, + "learning_rate": 2.0879953811903396e-06, + "loss": 0.88583469, + "num_input_tokens_seen": 89959495, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.5546875, + "step": 4170, + "time_per_iteration": 3.0496881008148193 + }, + { + "auxiliary_loss_clip": 0.01509664, + "auxiliary_loss_mlp": 0.01287808, + "balance_loss_clip": 1.1466974, + "balance_loss_mlp": 1.03069758, + "epoch": 0.5015330968556484, + "flos": 27529974495360.0, + "grad_norm": 1.9663412930667339, + "language_loss": 0.78986645, + "learning_rate": 2.08721715721893e-06, + "loss": 0.81784117, + "num_input_tokens_seen": 89978820, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.5703125, + "step": 4171, + "time_per_iteration": 3.0987770557403564 + }, + { + "auxiliary_loss_clip": 0.01510335, + "auxiliary_loss_mlp": 0.01276202, + "balance_loss_clip": 1.14857078, + "balance_loss_mlp": 1.02080846, + "epoch": 0.5016533397462875, + "flos": 23802812907840.0, + "grad_norm": 1.9342330969038735, + "language_loss": 0.76708478, + "learning_rate": 2.0864389200167477e-06, + "loss": 0.79495013, + "num_input_tokens_seen": 89997075, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 2.55273438, + "step": 4172, + "time_per_iteration": 3.169698715209961 + }, + { + "auxiliary_loss_clip": 0.01514492, + "auxiliary_loss_mlp": 0.01272574, + "balance_loss_clip": 1.15187001, + "balance_loss_mlp": 1.01717997, + "epoch": 0.5017735826369266, + "flos": 25297007708160.0, + "grad_norm": 1.7451468790633924, + "language_loss": 0.78639376, + "learning_rate": 2.0856606697018504e-06, + "loss": 0.81426442, + "num_input_tokens_seen": 90015085, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.55273438, + "step": 4173, + "time_per_iteration": 2.9699153900146484 + }, + { + "auxiliary_loss_clip": 0.01509156, + "auxiliary_loss_mlp": 0.01278275, + "balance_loss_clip": 1.14654398, + "balance_loss_mlp": 1.020401, + "epoch": 0.5018938255275657, + "flos": 16875089406720.0, + "grad_norm": 2.3197759354492944, + "language_loss": 0.73157817, + "learning_rate": 2.084882406392297e-06, + "loss": 0.75945246, + "num_input_tokens_seen": 90033045, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.578125, + "step": 4174, + "time_per_iteration": 3.007636070251465 + }, + { + "auxiliary_loss_clip": 0.01514036, + "auxiliary_loss_mlp": 0.01279544, + "balance_loss_clip": 1.15191698, + "balance_loss_mlp": 1.02338672, + "epoch": 0.5020140684182047, + "flos": 25517524216320.0, + "grad_norm": 3.9545712274166607, + "language_loss": 0.7169202, + "learning_rate": 2.0841041302061496e-06, + "loss": 0.744856, + "num_input_tokens_seen": 90052505, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 2.56054688, + "step": 4175, + "time_per_iteration": 3.0951623916625977 + }, + { + "auxiliary_loss_clip": 0.01517216, + "auxiliary_loss_mlp": 0.0128076, + "balance_loss_clip": 1.15497875, + "balance_loss_mlp": 1.02441251, + "epoch": 0.5021343113088439, + "flos": 23661529054560.0, + "grad_norm": 2.2602737227510263, + "language_loss": 0.75865561, + "learning_rate": 2.083325841261473e-06, + "loss": 0.78663534, + "num_input_tokens_seen": 90071565, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.5625, + "step": 4176, + "time_per_iteration": 2.9888923168182373 + }, + { + "auxiliary_loss_clip": 0.01515151, + "auxiliary_loss_mlp": 0.01282008, + "balance_loss_clip": 1.15203893, + "balance_loss_mlp": 1.02508783, + "epoch": 0.502254554199483, + "flos": 24536957659200.0, + "grad_norm": 2.447068751103309, + "language_loss": 0.6610285, + "learning_rate": 2.0825475396763322e-06, + "loss": 0.68900013, + "num_input_tokens_seen": 90092215, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.56835938, + "step": 4177, + "time_per_iteration": 3.124542236328125 + }, + { + "auxiliary_loss_clip": 0.01518388, + "auxiliary_loss_mlp": 0.01278632, + "balance_loss_clip": 1.15694404, + "balance_loss_mlp": 1.02304697, + "epoch": 0.502374797090122, + "flos": 34243136209440.0, + "grad_norm": 1.5046144499628038, + "language_loss": 0.65700614, + "learning_rate": 2.081769225568796e-06, + "loss": 0.68497634, + "num_input_tokens_seen": 90114665, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 2.5546875, + "step": 4178, + "time_per_iteration": 3.178741931915283 + }, + { + "auxiliary_loss_clip": 0.01512757, + "auxiliary_loss_mlp": 0.01283242, + "balance_loss_clip": 1.15145564, + "balance_loss_mlp": 1.02899253, + "epoch": 0.5024950399807612, + "flos": 26033504005440.0, + "grad_norm": 1.6024952885200154, + "language_loss": 0.75999475, + "learning_rate": 2.0809908990569327e-06, + "loss": 0.78795481, + "num_input_tokens_seen": 90136445, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 2.54101562, + "step": 4179, + "time_per_iteration": 2.9856467247009277 + }, + { + "auxiliary_loss_clip": 0.01516399, + "auxiliary_loss_mlp": 0.012819, + "balance_loss_clip": 1.15348554, + "balance_loss_mlp": 1.02707815, + "epoch": 0.5026152828714002, + "flos": 21254621904000.0, + "grad_norm": 1.9051067576605663, + "language_loss": 0.79087961, + "learning_rate": 2.0802125602588146e-06, + "loss": 0.81886256, + "num_input_tokens_seen": 90155710, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.546875, + "step": 4180, + "time_per_iteration": 3.012150526046753 + }, + { + "auxiliary_loss_clip": 0.01516155, + "auxiliary_loss_mlp": 0.012717, + "balance_loss_clip": 1.15365839, + "balance_loss_mlp": 1.01745069, + "epoch": 0.5027355257620393, + "flos": 30958752333600.0, + "grad_norm": 2.0245343482574167, + "language_loss": 0.66558969, + "learning_rate": 2.0794342092925146e-06, + "loss": 0.69346827, + "num_input_tokens_seen": 90176845, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.54101562, + "step": 4181, + "time_per_iteration": 3.0401663780212402 + }, + { + "auxiliary_loss_clip": 0.01524496, + "auxiliary_loss_mlp": 0.01286563, + "balance_loss_clip": 1.16294539, + "balance_loss_mlp": 1.03135955, + "epoch": 0.5028557686526784, + "flos": 24793506283680.0, + "grad_norm": 2.6164955677615507, + "language_loss": 0.68070602, + "learning_rate": 2.078655846276108e-06, + "loss": 0.70881665, + "num_input_tokens_seen": 90197175, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 2.55078125, + "step": 4182, + "time_per_iteration": 3.105764150619507 + }, + { + "auxiliary_loss_clip": 0.01519522, + "auxiliary_loss_mlp": 0.01274839, + "balance_loss_clip": 1.15764534, + "balance_loss_mlp": 1.01925421, + "epoch": 0.5029760115433175, + "flos": 22969257356160.0, + "grad_norm": 2.3668556867944384, + "language_loss": 0.69191885, + "learning_rate": 2.0778774713276727e-06, + "loss": 0.71986246, + "num_input_tokens_seen": 90216650, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.5546875, + "step": 4183, + "time_per_iteration": 3.013517379760742 + }, + { + "auxiliary_loss_clip": 0.01516694, + "auxiliary_loss_mlp": 0.0127181, + "balance_loss_clip": 1.15466523, + "balance_loss_mlp": 1.0162251, + "epoch": 0.5030962544339566, + "flos": 15307009822080.0, + "grad_norm": 2.486440752402008, + "language_loss": 0.67964399, + "learning_rate": 2.077099084565287e-06, + "loss": 0.70752901, + "num_input_tokens_seen": 90234055, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 2.5546875, + "step": 4184, + "time_per_iteration": 3.013040781021118 + }, + { + "auxiliary_loss_clip": 0.01514875, + "auxiliary_loss_mlp": 0.01280658, + "balance_loss_clip": 1.15119815, + "balance_loss_mlp": 1.02431035, + "epoch": 0.5032164973245957, + "flos": 24496943086080.0, + "grad_norm": 2.304370267669134, + "language_loss": 0.64875817, + "learning_rate": 2.0763206861070313e-06, + "loss": 0.67671359, + "num_input_tokens_seen": 90253115, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 2.5625, + "step": 4185, + "time_per_iteration": 3.0203769207000732 + }, + { + "auxiliary_loss_clip": 0.01514796, + "auxiliary_loss_mlp": 0.01286443, + "balance_loss_clip": 1.15174603, + "balance_loss_mlp": 1.03162086, + "epoch": 0.5033367402152348, + "flos": 16215132864960.0, + "grad_norm": 2.29668848994224, + "language_loss": 0.75314957, + "learning_rate": 2.0755422760709876e-06, + "loss": 0.7811619, + "num_input_tokens_seen": 90270515, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.546875, + "step": 4186, + "time_per_iteration": 3.843090057373047 + }, + { + "auxiliary_loss_clip": 0.01520902, + "auxiliary_loss_mlp": 0.01283463, + "balance_loss_clip": 1.15760469, + "balance_loss_mlp": 1.02826011, + "epoch": 0.5034569831058738, + "flos": 21393174929760.0, + "grad_norm": 2.6382129128081693, + "language_loss": 0.77081078, + "learning_rate": 2.0747638545752417e-06, + "loss": 0.79885447, + "num_input_tokens_seen": 90289075, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.55078125, + "step": 4187, + "time_per_iteration": 2.9210424423217773 + }, + { + "auxiliary_loss_clip": 0.01520901, + "auxiliary_loss_mlp": 0.012905, + "balance_loss_clip": 1.1585933, + "balance_loss_mlp": 1.0345341, + "epoch": 0.503577225996513, + "flos": 20560871007360.0, + "grad_norm": 2.0387676272995017, + "language_loss": 0.83357918, + "learning_rate": 2.073985421737878e-06, + "loss": 0.86169314, + "num_input_tokens_seen": 90306385, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.55859375, + "step": 4188, + "time_per_iteration": 3.0081348419189453 + }, + { + "auxiliary_loss_clip": 0.01517842, + "auxiliary_loss_mlp": 0.01287137, + "balance_loss_clip": 1.1551863, + "balance_loss_mlp": 1.02850044, + "epoch": 0.5036974688871521, + "flos": 27231287320800.0, + "grad_norm": 2.5690759060670887, + "language_loss": 0.73593736, + "learning_rate": 2.0732069776769844e-06, + "loss": 0.76398712, + "num_input_tokens_seen": 90323795, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.5859375, + "step": 4189, + "time_per_iteration": 2.9643943309783936 + }, + { + "auxiliary_loss_clip": 0.01527965, + "auxiliary_loss_mlp": 0.01296346, + "balance_loss_clip": 1.16515338, + "balance_loss_mlp": 1.03828132, + "epoch": 0.5038177117777911, + "flos": 20414239283520.0, + "grad_norm": 2.5862214043473535, + "language_loss": 0.73174584, + "learning_rate": 2.072428522510651e-06, + "loss": 0.75998896, + "num_input_tokens_seen": 90340360, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.58007812, + "step": 4190, + "time_per_iteration": 3.8029887676239014 + }, + { + "auxiliary_loss_clip": 0.01526496, + "auxiliary_loss_mlp": 0.01279336, + "balance_loss_clip": 1.16622484, + "balance_loss_mlp": 1.02031791, + "epoch": 0.5039379546684303, + "flos": 21910140851040.0, + "grad_norm": 2.322402253344117, + "language_loss": 0.75999677, + "learning_rate": 2.071650056356968e-06, + "loss": 0.78805518, + "num_input_tokens_seen": 90357900, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 2.58984375, + "step": 4191, + "time_per_iteration": 3.009456157684326 + }, + { + "auxiliary_loss_clip": 0.0152541, + "auxiliary_loss_mlp": 0.01286777, + "balance_loss_clip": 1.16507983, + "balance_loss_mlp": 1.03023839, + "epoch": 0.5040581975590693, + "flos": 20012765702400.0, + "grad_norm": 2.0095136970312524, + "language_loss": 0.8007496, + "learning_rate": 2.070871579334028e-06, + "loss": 0.82887143, + "num_input_tokens_seen": 90377010, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 2.56445312, + "step": 4192, + "time_per_iteration": 2.907647132873535 + }, + { + "auxiliary_loss_clip": 0.01523367, + "auxiliary_loss_mlp": 0.01298614, + "balance_loss_clip": 1.16009974, + "balance_loss_mlp": 1.03845143, + "epoch": 0.5041784404497084, + "flos": 20961965306880.0, + "grad_norm": 1.9559819532167537, + "language_loss": 0.71777344, + "learning_rate": 2.0700930915599264e-06, + "loss": 0.74599314, + "num_input_tokens_seen": 90396740, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.6015625, + "step": 4193, + "time_per_iteration": 3.827291965484619 + }, + { + "auxiliary_loss_clip": 0.01523354, + "auxiliary_loss_mlp": 0.01288041, + "balance_loss_clip": 1.1602304, + "balance_loss_mlp": 1.03112149, + "epoch": 0.5042986833403476, + "flos": 12496732682400.0, + "grad_norm": 2.5216307130642117, + "language_loss": 0.78585035, + "learning_rate": 2.0693145931527583e-06, + "loss": 0.81396431, + "num_input_tokens_seen": 90413220, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.56835938, + "step": 4194, + "time_per_iteration": 2.9277896881103516 + }, + { + "auxiliary_loss_clip": 0.01522404, + "auxiliary_loss_mlp": 0.01282815, + "balance_loss_clip": 1.159958, + "balance_loss_mlp": 1.02551317, + "epoch": 0.5044189262309866, + "flos": 29204709158880.0, + "grad_norm": 1.9671018110054006, + "language_loss": 0.77674729, + "learning_rate": 2.068536084230622e-06, + "loss": 0.80479944, + "num_input_tokens_seen": 90435085, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.57226562, + "step": 4195, + "time_per_iteration": 3.0939342975616455 + }, + { + "auxiliary_loss_clip": 0.01526702, + "auxiliary_loss_mlp": 0.01292717, + "balance_loss_clip": 1.16597652, + "balance_loss_mlp": 1.0329361, + "epoch": 0.5045391691216257, + "flos": 23875180565760.0, + "grad_norm": 2.1595293251465906, + "language_loss": 0.88597429, + "learning_rate": 2.067757564911616e-06, + "loss": 0.91416848, + "num_input_tokens_seen": 90453660, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 2.59765625, + "step": 4196, + "time_per_iteration": 2.982295036315918 + }, + { + "auxiliary_loss_clip": 0.01526204, + "auxiliary_loss_mlp": 0.01295248, + "balance_loss_clip": 1.16463482, + "balance_loss_mlp": 1.0371834, + "epoch": 0.5046594120122648, + "flos": 24647329697760.0, + "grad_norm": 2.339024747542396, + "language_loss": 0.92862189, + "learning_rate": 2.0669790353138407e-06, + "loss": 0.9568364, + "num_input_tokens_seen": 90472625, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 2.58007812, + "step": 4197, + "time_per_iteration": 2.9370346069335938 + }, + { + "auxiliary_loss_clip": 0.01522021, + "auxiliary_loss_mlp": 0.01287156, + "balance_loss_clip": 1.15886521, + "balance_loss_mlp": 1.02966428, + "epoch": 0.5047796549029039, + "flos": 23364814144320.0, + "grad_norm": 6.334377216875856, + "language_loss": 0.73235452, + "learning_rate": 2.0662004955553995e-06, + "loss": 0.76044631, + "num_input_tokens_seen": 90492325, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.57421875, + "step": 4198, + "time_per_iteration": 3.059032678604126 + }, + { + "auxiliary_loss_clip": 0.01525061, + "auxiliary_loss_mlp": 0.01295416, + "balance_loss_clip": 1.1634438, + "balance_loss_mlp": 1.03849649, + "epoch": 0.5048998977935429, + "flos": 17305616322720.0, + "grad_norm": 2.345621001875019, + "language_loss": 0.77105653, + "learning_rate": 2.065421945754395e-06, + "loss": 0.79926133, + "num_input_tokens_seen": 90510055, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 2.56835938, + "step": 4199, + "time_per_iteration": 2.9702608585357666 + }, + { + "auxiliary_loss_clip": 0.01520532, + "auxiliary_loss_mlp": 0.01299152, + "balance_loss_clip": 1.15718055, + "balance_loss_mlp": 1.04204118, + "epoch": 0.505020140684182, + "flos": 34859664643680.0, + "grad_norm": 2.000016984924329, + "language_loss": 0.77988291, + "learning_rate": 2.0646433860289344e-06, + "loss": 0.80807972, + "num_input_tokens_seen": 90528980, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.5703125, + "step": 4200, + "time_per_iteration": 3.081981897354126 + }, + { + "auxiliary_loss_clip": 0.01521304, + "auxiliary_loss_mlp": 0.0128533, + "balance_loss_clip": 1.15891099, + "balance_loss_mlp": 1.02707505, + "epoch": 0.5051403835748212, + "flos": 24866329079520.0, + "grad_norm": 2.0054793557281294, + "language_loss": 0.82679641, + "learning_rate": 2.0638648164971233e-06, + "loss": 0.85486275, + "num_input_tokens_seen": 90547445, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 2.58203125, + "step": 4201, + "time_per_iteration": 3.0818827152252197 + }, + { + "auxiliary_loss_clip": 0.01523454, + "auxiliary_loss_mlp": 0.01283548, + "balance_loss_clip": 1.16129303, + "balance_loss_mlp": 1.02758217, + "epoch": 0.5052606264654602, + "flos": 20961813594240.0, + "grad_norm": 3.0739681670424885, + "language_loss": 0.88903284, + "learning_rate": 2.06308623727707e-06, + "loss": 0.91710281, + "num_input_tokens_seen": 90567545, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.55859375, + "step": 4202, + "time_per_iteration": 3.026524543762207 + }, + { + "auxiliary_loss_clip": 0.01526099, + "auxiliary_loss_mlp": 0.01287146, + "balance_loss_clip": 1.16513371, + "balance_loss_mlp": 1.0290817, + "epoch": 0.5053808693560993, + "flos": 19644214128480.0, + "grad_norm": 2.3847473211101478, + "language_loss": 0.76704919, + "learning_rate": 2.0623076484868846e-06, + "loss": 0.79518163, + "num_input_tokens_seen": 90585000, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 2.58007812, + "step": 4203, + "time_per_iteration": 2.947070360183716 + }, + { + "auxiliary_loss_clip": 0.01535829, + "auxiliary_loss_mlp": 0.01205109, + "balance_loss_clip": 1.1788727, + "balance_loss_mlp": 0.99606323, + "epoch": 0.5055011122467384, + "flos": 67511091660480.0, + "grad_norm": 0.8478049949298598, + "language_loss": 0.60689318, + "learning_rate": 2.061529050244679e-06, + "loss": 0.63430262, + "num_input_tokens_seen": 90644745, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.09375, + "step": 4204, + "time_per_iteration": 3.4066121578216553 + }, + { + "auxiliary_loss_clip": 0.01520228, + "auxiliary_loss_mlp": 0.0128528, + "balance_loss_clip": 1.15952158, + "balance_loss_mlp": 1.02931404, + "epoch": 0.5056213551373775, + "flos": 16874823909600.0, + "grad_norm": 2.993963156112308, + "language_loss": 0.74060273, + "learning_rate": 2.060750442668565e-06, + "loss": 0.7686578, + "num_input_tokens_seen": 90662500, + "router_z_loss_clip": 3.60742188, + "router_z_loss_mlp": 2.55859375, + "step": 4205, + "time_per_iteration": 2.913095712661743 + }, + { + "auxiliary_loss_clip": 0.01531644, + "auxiliary_loss_mlp": 0.0129308, + "balance_loss_clip": 1.17031288, + "balance_loss_mlp": 1.03577888, + "epoch": 0.5057415980280165, + "flos": 15335608019040.0, + "grad_norm": 5.124098152606532, + "language_loss": 0.63898075, + "learning_rate": 2.059971825876657e-06, + "loss": 0.66722798, + "num_input_tokens_seen": 90677010, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 2.57226562, + "step": 4206, + "time_per_iteration": 2.9212512969970703 + }, + { + "auxiliary_loss_clip": 0.01523947, + "auxiliary_loss_mlp": 0.01290147, + "balance_loss_clip": 1.16318941, + "balance_loss_mlp": 1.03341794, + "epoch": 0.5058618409186557, + "flos": 19027989119520.0, + "grad_norm": 2.276738252797507, + "language_loss": 0.76368296, + "learning_rate": 2.0591931999870713e-06, + "loss": 0.79182386, + "num_input_tokens_seen": 90695935, + "router_z_loss_clip": 3.60742188, + "router_z_loss_mlp": 2.56640625, + "step": 4207, + "time_per_iteration": 2.947997808456421 + }, + { + "auxiliary_loss_clip": 0.01533323, + "auxiliary_loss_mlp": 0.01206757, + "balance_loss_clip": 1.17769396, + "balance_loss_mlp": 0.99771118, + "epoch": 0.5059820838092948, + "flos": 63458958319200.0, + "grad_norm": 0.856374529482692, + "language_loss": 0.57518733, + "learning_rate": 2.0584145651179234e-06, + "loss": 0.60258818, + "num_input_tokens_seen": 90751645, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.09375, + "step": 4208, + "time_per_iteration": 3.3492841720581055 + }, + { + "auxiliary_loss_clip": 0.01522085, + "auxiliary_loss_mlp": 0.0127944, + "balance_loss_clip": 1.16087341, + "balance_loss_mlp": 1.02347422, + "epoch": 0.5061023266999338, + "flos": 15443135445600.0, + "grad_norm": 3.697839850011041, + "language_loss": 0.79790825, + "learning_rate": 2.0576359213873327e-06, + "loss": 0.8259235, + "num_input_tokens_seen": 90766795, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.55859375, + "step": 4209, + "time_per_iteration": 3.134991407394409 + }, + { + "auxiliary_loss_clip": 0.01523188, + "auxiliary_loss_mlp": 0.01293399, + "balance_loss_clip": 1.16285276, + "balance_loss_mlp": 1.03399992, + "epoch": 0.506222569590573, + "flos": 22453012069920.0, + "grad_norm": 3.5221066813259636, + "language_loss": 0.70391512, + "learning_rate": 2.056857268913419e-06, + "loss": 0.73208094, + "num_input_tokens_seen": 90786845, + "router_z_loss_clip": 3.60351562, + "router_z_loss_mlp": 2.59375, + "step": 4210, + "time_per_iteration": 3.0444955825805664 + }, + { + "auxiliary_loss_clip": 0.01521888, + "auxiliary_loss_mlp": 0.01289374, + "balance_loss_clip": 1.16048086, + "balance_loss_mlp": 1.03111887, + "epoch": 0.506342812481212, + "flos": 17560154754720.0, + "grad_norm": 3.194570006547554, + "language_loss": 0.8422761, + "learning_rate": 2.056078607814303e-06, + "loss": 0.87038875, + "num_input_tokens_seen": 90802630, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 2.58203125, + "step": 4211, + "time_per_iteration": 3.249751091003418 + }, + { + "auxiliary_loss_clip": 0.01522215, + "auxiliary_loss_mlp": 0.01305047, + "balance_loss_clip": 1.16032052, + "balance_loss_mlp": 1.0500344, + "epoch": 0.5064630553718511, + "flos": 23404183938720.0, + "grad_norm": 1.8740038059766335, + "language_loss": 0.78613091, + "learning_rate": 2.055299938208106e-06, + "loss": 0.81440353, + "num_input_tokens_seen": 90823620, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.54882812, + "step": 4212, + "time_per_iteration": 3.158071756362915 + }, + { + "auxiliary_loss_clip": 0.01526762, + "auxiliary_loss_mlp": 0.01314836, + "balance_loss_clip": 1.16763473, + "balance_loss_mlp": 1.05486465, + "epoch": 0.5065832982624903, + "flos": 23989269564000.0, + "grad_norm": 1.6556426022541733, + "language_loss": 0.86227036, + "learning_rate": 2.0545212602129526e-06, + "loss": 0.89068639, + "num_input_tokens_seen": 90843475, + "router_z_loss_clip": 3.59179688, + "router_z_loss_mlp": 2.59960938, + "step": 4213, + "time_per_iteration": 3.9009382724761963 + }, + { + "auxiliary_loss_clip": 0.01523577, + "auxiliary_loss_mlp": 0.01293818, + "balance_loss_clip": 1.16213262, + "balance_loss_mlp": 1.03746986, + "epoch": 0.5067035411531293, + "flos": 21504988238400.0, + "grad_norm": 2.2186558831737613, + "language_loss": 0.66332155, + "learning_rate": 2.0537425739469673e-06, + "loss": 0.69149554, + "num_input_tokens_seen": 90862410, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 2.5625, + "step": 4214, + "time_per_iteration": 3.9485015869140625 + }, + { + "auxiliary_loss_clip": 0.01525739, + "auxiliary_loss_mlp": 0.012491, + "balance_loss_clip": 1.17130446, + "balance_loss_mlp": 1.0377655, + "epoch": 0.5068237840437684, + "flos": 65940395032800.0, + "grad_norm": 0.8531509626494254, + "language_loss": 0.59420103, + "learning_rate": 2.052963879528276e-06, + "loss": 0.62194943, + "num_input_tokens_seen": 90922280, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 2.1171875, + "step": 4215, + "time_per_iteration": 3.443253517150879 + }, + { + "auxiliary_loss_clip": 0.01524062, + "auxiliary_loss_mlp": 0.01295724, + "balance_loss_clip": 1.1650157, + "balance_loss_mlp": 1.03880394, + "epoch": 0.5069440269344075, + "flos": 27266257448640.0, + "grad_norm": 2.5506818171170975, + "language_loss": 0.76664096, + "learning_rate": 2.052185177075007e-06, + "loss": 0.79483879, + "num_input_tokens_seen": 90941850, + "router_z_loss_clip": 3.59179688, + "router_z_loss_mlp": 2.56835938, + "step": 4216, + "time_per_iteration": 3.1450681686401367 + }, + { + "auxiliary_loss_clip": 0.01526566, + "auxiliary_loss_mlp": 0.01300975, + "balance_loss_clip": 1.16598749, + "balance_loss_mlp": 1.04157543, + "epoch": 0.5070642698250466, + "flos": 23368493175840.0, + "grad_norm": 2.0781152092856967, + "language_loss": 0.82965231, + "learning_rate": 2.051406466705288e-06, + "loss": 0.85792768, + "num_input_tokens_seen": 90961390, + "router_z_loss_clip": 3.609375, + "router_z_loss_mlp": 2.59375, + "step": 4217, + "time_per_iteration": 3.8652572631835938 + }, + { + "auxiliary_loss_clip": 0.01514539, + "auxiliary_loss_mlp": 0.01274687, + "balance_loss_clip": 1.15335643, + "balance_loss_mlp": 1.02120066, + "epoch": 0.5071845127156857, + "flos": 20342250907200.0, + "grad_norm": 2.790521536040488, + "language_loss": 0.81134605, + "learning_rate": 2.0506277485372486e-06, + "loss": 0.83923829, + "num_input_tokens_seen": 90980215, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 2.53320312, + "step": 4218, + "time_per_iteration": 3.1024765968322754 + }, + { + "auxiliary_loss_clip": 0.01524173, + "auxiliary_loss_mlp": 0.0128526, + "balance_loss_clip": 1.16411877, + "balance_loss_mlp": 1.0268147, + "epoch": 0.5073047556063248, + "flos": 12094538466240.0, + "grad_norm": 2.2177193657763725, + "language_loss": 0.67235959, + "learning_rate": 2.04984902268902e-06, + "loss": 0.70045388, + "num_input_tokens_seen": 90997415, + "router_z_loss_clip": 3.6015625, + "router_z_loss_mlp": 2.58398438, + "step": 4219, + "time_per_iteration": 3.0369045734405518 + }, + { + "auxiliary_loss_clip": 0.01529021, + "auxiliary_loss_mlp": 0.01295099, + "balance_loss_clip": 1.16969228, + "balance_loss_mlp": 1.03608108, + "epoch": 0.5074249984969639, + "flos": 19684911408480.0, + "grad_norm": 2.780366279613989, + "language_loss": 0.75973547, + "learning_rate": 2.0490702892787345e-06, + "loss": 0.78797662, + "num_input_tokens_seen": 91016475, + "router_z_loss_clip": 3.59375, + "router_z_loss_mlp": 2.58984375, + "step": 4220, + "time_per_iteration": 3.1462204456329346 + }, + { + "auxiliary_loss_clip": 0.01522617, + "auxiliary_loss_mlp": 0.01290498, + "balance_loss_clip": 1.16284013, + "balance_loss_mlp": 1.03739285, + "epoch": 0.5075452413876029, + "flos": 28768151665440.0, + "grad_norm": 1.6563477143796026, + "language_loss": 0.62321472, + "learning_rate": 2.0482915484245246e-06, + "loss": 0.65134591, + "num_input_tokens_seen": 91038095, + "router_z_loss_clip": 3.6015625, + "router_z_loss_mlp": 2.52929688, + "step": 4221, + "time_per_iteration": 3.9240076541900635 + }, + { + "auxiliary_loss_clip": 0.01532671, + "auxiliary_loss_mlp": 0.01277174, + "balance_loss_clip": 1.17318892, + "balance_loss_mlp": 1.01910949, + "epoch": 0.5076654842782421, + "flos": 20341530272160.0, + "grad_norm": 4.16044821166844, + "language_loss": 0.84238726, + "learning_rate": 2.047512800244526e-06, + "loss": 0.87048578, + "num_input_tokens_seen": 91053360, + "router_z_loss_clip": 3.59570312, + "router_z_loss_mlp": 2.58007812, + "step": 4222, + "time_per_iteration": 3.1486427783966064 + }, + { + "auxiliary_loss_clip": 0.01524372, + "auxiliary_loss_mlp": 0.0129311, + "balance_loss_clip": 1.16436148, + "balance_loss_mlp": 1.04057658, + "epoch": 0.5077857271688812, + "flos": 26362078934400.0, + "grad_norm": 2.1509502666389424, + "language_loss": 0.79091758, + "learning_rate": 2.046734044856873e-06, + "loss": 0.81909239, + "num_input_tokens_seen": 91072770, + "router_z_loss_clip": 3.60351562, + "router_z_loss_mlp": 2.5234375, + "step": 4223, + "time_per_iteration": 3.0633208751678467 + }, + { + "auxiliary_loss_clip": 0.01525405, + "auxiliary_loss_mlp": 0.01282126, + "balance_loss_clip": 1.1644125, + "balance_loss_mlp": 1.03054619, + "epoch": 0.5079059700595202, + "flos": 21801134226240.0, + "grad_norm": 2.2079082540670476, + "language_loss": 0.8146463, + "learning_rate": 2.045955282379702e-06, + "loss": 0.84272158, + "num_input_tokens_seen": 91091430, + "router_z_loss_clip": 3.609375, + "router_z_loss_mlp": 2.51367188, + "step": 4224, + "time_per_iteration": 2.9942047595977783 + }, + { + "auxiliary_loss_clip": 0.01527466, + "auxiliary_loss_mlp": 0.01297565, + "balance_loss_clip": 1.1674242, + "balance_loss_mlp": 1.03911936, + "epoch": 0.5080262129501594, + "flos": 13189611231360.0, + "grad_norm": 4.523532781749786, + "language_loss": 0.75535691, + "learning_rate": 2.045176512931152e-06, + "loss": 0.78360718, + "num_input_tokens_seen": 91106060, + "router_z_loss_clip": 3.60351562, + "router_z_loss_mlp": 2.58398438, + "step": 4225, + "time_per_iteration": 2.9213812351226807 + }, + { + "auxiliary_loss_clip": 0.01523801, + "auxiliary_loss_mlp": 0.01280415, + "balance_loss_clip": 1.16337955, + "balance_loss_mlp": 1.02578425, + "epoch": 0.5081464558407984, + "flos": 25303834776960.0, + "grad_norm": 1.9448362515333752, + "language_loss": 0.76177382, + "learning_rate": 2.0443977366293604e-06, + "loss": 0.78981602, + "num_input_tokens_seen": 91124100, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 2.54492188, + "step": 4226, + "time_per_iteration": 2.991603374481201 + }, + { + "auxiliary_loss_clip": 0.01526979, + "auxiliary_loss_mlp": 0.01282465, + "balance_loss_clip": 1.16594076, + "balance_loss_mlp": 1.02478218, + "epoch": 0.5082666987314375, + "flos": 30953594103840.0, + "grad_norm": 1.636648352981939, + "language_loss": 0.7699213, + "learning_rate": 2.043618953592468e-06, + "loss": 0.79801571, + "num_input_tokens_seen": 91146555, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 2.57617188, + "step": 4227, + "time_per_iteration": 3.066776752471924 + }, + { + "auxiliary_loss_clip": 0.01528193, + "auxiliary_loss_mlp": 0.01284971, + "balance_loss_clip": 1.16632354, + "balance_loss_mlp": 1.0282414, + "epoch": 0.5083869416220766, + "flos": 19684532126880.0, + "grad_norm": 1.8787703267143585, + "language_loss": 0.81027734, + "learning_rate": 2.0428401639386144e-06, + "loss": 0.83840901, + "num_input_tokens_seen": 91167120, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 2.56640625, + "step": 4228, + "time_per_iteration": 3.0321781635284424 + }, + { + "auxiliary_loss_clip": 0.01537461, + "auxiliary_loss_mlp": 0.01204735, + "balance_loss_clip": 1.18404341, + "balance_loss_mlp": 0.99874115, + "epoch": 0.5085071845127157, + "flos": 71824362933600.0, + "grad_norm": 0.8248743834036206, + "language_loss": 0.58000207, + "learning_rate": 2.042061367785943e-06, + "loss": 0.60742402, + "num_input_tokens_seen": 91220260, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 2.0625, + "step": 4229, + "time_per_iteration": 3.418818950653076 + }, + { + "auxiliary_loss_clip": 0.01521729, + "auxiliary_loss_mlp": 0.01292916, + "balance_loss_clip": 1.16127884, + "balance_loss_mlp": 1.03561449, + "epoch": 0.5086274274033548, + "flos": 35954358127200.0, + "grad_norm": 4.837363558136184, + "language_loss": 0.74852675, + "learning_rate": 2.041282565252594e-06, + "loss": 0.7766732, + "num_input_tokens_seen": 91240425, + "router_z_loss_clip": 3.6015625, + "router_z_loss_mlp": 2.57226562, + "step": 4230, + "time_per_iteration": 3.196626663208008 + }, + { + "auxiliary_loss_clip": 0.01528634, + "auxiliary_loss_mlp": 0.01281913, + "balance_loss_clip": 1.1677916, + "balance_loss_mlp": 1.02804494, + "epoch": 0.5087476702939938, + "flos": 23515997247360.0, + "grad_norm": 2.1487128143296905, + "language_loss": 0.76887566, + "learning_rate": 2.040503756456714e-06, + "loss": 0.7969811, + "num_input_tokens_seen": 91259635, + "router_z_loss_clip": 3.60742188, + "router_z_loss_mlp": 2.53710938, + "step": 4231, + "time_per_iteration": 3.0447158813476562 + }, + { + "auxiliary_loss_clip": 0.01522282, + "auxiliary_loss_mlp": 0.01288554, + "balance_loss_clip": 1.16237414, + "balance_loss_mlp": 1.03144383, + "epoch": 0.508867913184633, + "flos": 15123890844000.0, + "grad_norm": 2.1468946076796835, + "language_loss": 0.79238302, + "learning_rate": 2.0397249415164456e-06, + "loss": 0.82049137, + "num_input_tokens_seen": 91276990, + "router_z_loss_clip": 3.6015625, + "router_z_loss_mlp": 2.5703125, + "step": 4232, + "time_per_iteration": 3.1161446571350098 + }, + { + "auxiliary_loss_clip": 0.01528017, + "auxiliary_loss_mlp": 0.01297751, + "balance_loss_clip": 1.16576719, + "balance_loss_mlp": 1.04064071, + "epoch": 0.508988156075272, + "flos": 25887706701120.0, + "grad_norm": 2.54910195535857, + "language_loss": 0.80109286, + "learning_rate": 2.0389461205499354e-06, + "loss": 0.82935047, + "num_input_tokens_seen": 91296125, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 2.5703125, + "step": 4233, + "time_per_iteration": 3.180763006210327 + }, + { + "auxiliary_loss_clip": 0.01524029, + "auxiliary_loss_mlp": 0.0127891, + "balance_loss_clip": 1.16395736, + "balance_loss_mlp": 1.02466011, + "epoch": 0.5091083989659111, + "flos": 13846571448480.0, + "grad_norm": 2.209031263984394, + "language_loss": 0.73245406, + "learning_rate": 2.03816729367533e-06, + "loss": 0.76048344, + "num_input_tokens_seen": 91314280, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 2.54101562, + "step": 4234, + "time_per_iteration": 3.2067551612854004 + }, + { + "auxiliary_loss_clip": 0.01524848, + "auxiliary_loss_mlp": 0.01283365, + "balance_loss_clip": 1.16341019, + "balance_loss_mlp": 1.02530074, + "epoch": 0.5092286418565503, + "flos": 21106966119840.0, + "grad_norm": 3.1511208759940743, + "language_loss": 0.71571958, + "learning_rate": 2.0373884610107765e-06, + "loss": 0.74380171, + "num_input_tokens_seen": 91334595, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 2.58007812, + "step": 4235, + "time_per_iteration": 3.175987720489502 + }, + { + "auxiliary_loss_clip": 0.01520763, + "auxiliary_loss_mlp": 0.01285682, + "balance_loss_clip": 1.16018915, + "balance_loss_mlp": 1.02914321, + "epoch": 0.5093488847471893, + "flos": 18623102004000.0, + "grad_norm": 3.654491875910353, + "language_loss": 0.69474506, + "learning_rate": 2.0366096226744225e-06, + "loss": 0.72280943, + "num_input_tokens_seen": 91349790, + "router_z_loss_clip": 3.609375, + "router_z_loss_mlp": 2.56445312, + "step": 4236, + "time_per_iteration": 3.0844480991363525 + }, + { + "auxiliary_loss_clip": 0.01519972, + "auxiliary_loss_mlp": 0.01293136, + "balance_loss_clip": 1.15842199, + "balance_loss_mlp": 1.03716969, + "epoch": 0.5094691276378284, + "flos": 23805960945120.0, + "grad_norm": 2.29700017554645, + "language_loss": 0.76386487, + "learning_rate": 2.035830778784418e-06, + "loss": 0.79199594, + "num_input_tokens_seen": 91370465, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 2.55859375, + "step": 4237, + "time_per_iteration": 3.120009183883667 + }, + { + "auxiliary_loss_clip": 0.01528833, + "auxiliary_loss_mlp": 0.01293107, + "balance_loss_clip": 1.16585016, + "balance_loss_mlp": 1.03599596, + "epoch": 0.5095893705284675, + "flos": 17422322364000.0, + "grad_norm": 3.915261747458861, + "language_loss": 0.80398059, + "learning_rate": 2.0350519294589134e-06, + "loss": 0.83220005, + "num_input_tokens_seen": 91388505, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.5703125, + "step": 4238, + "time_per_iteration": 3.0737226009368896 + }, + { + "auxiliary_loss_clip": 0.01514117, + "auxiliary_loss_mlp": 0.01289105, + "balance_loss_clip": 1.15358067, + "balance_loss_mlp": 1.02951503, + "epoch": 0.5097096134191066, + "flos": 25851788369280.0, + "grad_norm": 2.481070778911992, + "language_loss": 0.82935405, + "learning_rate": 2.0342730748160588e-06, + "loss": 0.85738629, + "num_input_tokens_seen": 91408970, + "router_z_loss_clip": 3.609375, + "router_z_loss_mlp": 2.59570312, + "step": 4239, + "time_per_iteration": 3.0510404109954834 + }, + { + "auxiliary_loss_clip": 0.01524126, + "auxiliary_loss_mlp": 0.01294856, + "balance_loss_clip": 1.16259313, + "balance_loss_mlp": 1.03431225, + "epoch": 0.5098298563097456, + "flos": 27747570535200.0, + "grad_norm": 3.041809802489082, + "language_loss": 0.71100575, + "learning_rate": 2.033494214974006e-06, + "loss": 0.73919559, + "num_input_tokens_seen": 91430115, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.60546875, + "step": 4240, + "time_per_iteration": 3.123314619064331 + }, + { + "auxiliary_loss_clip": 0.01522353, + "auxiliary_loss_mlp": 0.01276366, + "balance_loss_clip": 1.16016173, + "balance_loss_mlp": 1.0238328, + "epoch": 0.5099500992003848, + "flos": 21360935629440.0, + "grad_norm": 2.7035129667908824, + "language_loss": 0.83803731, + "learning_rate": 2.0327153500509067e-06, + "loss": 0.86602449, + "num_input_tokens_seen": 91449140, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 2.5234375, + "step": 4241, + "time_per_iteration": 4.750185251235962 + }, + { + "auxiliary_loss_clip": 0.01522171, + "auxiliary_loss_mlp": 0.01300394, + "balance_loss_clip": 1.16075516, + "balance_loss_mlp": 1.04137576, + "epoch": 0.5100703420910239, + "flos": 19868106242880.0, + "grad_norm": 2.599827793336195, + "language_loss": 0.84459311, + "learning_rate": 2.031936480164916e-06, + "loss": 0.87281877, + "num_input_tokens_seen": 91466880, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 2.58984375, + "step": 4242, + "time_per_iteration": 2.9587152004241943 + }, + { + "auxiliary_loss_clip": 0.01522042, + "auxiliary_loss_mlp": 0.0128931, + "balance_loss_clip": 1.16011894, + "balance_loss_mlp": 1.03124619, + "epoch": 0.5101905849816629, + "flos": 24651350082720.0, + "grad_norm": 2.261719883205442, + "language_loss": 0.80071485, + "learning_rate": 2.0311576054341857e-06, + "loss": 0.82882833, + "num_input_tokens_seen": 91487495, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.58007812, + "step": 4243, + "time_per_iteration": 3.0109989643096924 + }, + { + "auxiliary_loss_clip": 0.01529601, + "auxiliary_loss_mlp": 0.01287294, + "balance_loss_clip": 1.16830695, + "balance_loss_mlp": 1.02808547, + "epoch": 0.5103108278723021, + "flos": 22932314964000.0, + "grad_norm": 1.8351851068868261, + "language_loss": 0.62772256, + "learning_rate": 2.0303787259768715e-06, + "loss": 0.65589154, + "num_input_tokens_seen": 91508395, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 2.59179688, + "step": 4244, + "time_per_iteration": 2.985403299331665 + }, + { + "auxiliary_loss_clip": 0.01529216, + "auxiliary_loss_mlp": 0.01278189, + "balance_loss_clip": 1.16567504, + "balance_loss_mlp": 1.02355766, + "epoch": 0.5104310707629411, + "flos": 21508743126240.0, + "grad_norm": 2.64397530891846, + "language_loss": 0.69614196, + "learning_rate": 2.0295998419111294e-06, + "loss": 0.72421598, + "num_input_tokens_seen": 91525685, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.54492188, + "step": 4245, + "time_per_iteration": 3.8846278190612793 + }, + { + "auxiliary_loss_clip": 0.01525244, + "auxiliary_loss_mlp": 0.01285428, + "balance_loss_clip": 1.16276968, + "balance_loss_mlp": 1.02946162, + "epoch": 0.5105513136535802, + "flos": 14904967318560.0, + "grad_norm": 2.6761352169047736, + "language_loss": 0.73901045, + "learning_rate": 2.028820953355115e-06, + "loss": 0.76711714, + "num_input_tokens_seen": 91543785, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.55859375, + "step": 4246, + "time_per_iteration": 3.0865702629089355 + }, + { + "auxiliary_loss_clip": 0.01521319, + "auxiliary_loss_mlp": 0.01295894, + "balance_loss_clip": 1.16031802, + "balance_loss_mlp": 1.03554058, + "epoch": 0.5106715565442194, + "flos": 22604460670080.0, + "grad_norm": 2.673260424557005, + "language_loss": 0.78712058, + "learning_rate": 2.0280420604269834e-06, + "loss": 0.81529272, + "num_input_tokens_seen": 91563325, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 2.60351562, + "step": 4247, + "time_per_iteration": 3.0123414993286133 + }, + { + "auxiliary_loss_clip": 0.01531115, + "auxiliary_loss_mlp": 0.01203583, + "balance_loss_clip": 1.17480791, + "balance_loss_mlp": 0.99530029, + "epoch": 0.5107917994348584, + "flos": 71035145765280.0, + "grad_norm": 0.7460005551699732, + "language_loss": 0.58886397, + "learning_rate": 2.027263163244895e-06, + "loss": 0.61621094, + "num_input_tokens_seen": 91632450, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.0859375, + "step": 4248, + "time_per_iteration": 4.45014500617981 + }, + { + "auxiliary_loss_clip": 0.01520131, + "auxiliary_loss_mlp": 0.01280125, + "balance_loss_clip": 1.15875721, + "balance_loss_mlp": 1.02358627, + "epoch": 0.5109120423254975, + "flos": 24829765968960.0, + "grad_norm": 1.7745533513618175, + "language_loss": 0.74439138, + "learning_rate": 2.026484261927005e-06, + "loss": 0.77239394, + "num_input_tokens_seen": 91651945, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 2.56445312, + "step": 4249, + "time_per_iteration": 3.1218976974487305 + }, + { + "auxiliary_loss_clip": 0.01519871, + "auxiliary_loss_mlp": 0.01292447, + "balance_loss_clip": 1.15638661, + "balance_loss_mlp": 1.03190351, + "epoch": 0.5110322852161366, + "flos": 21249577458720.0, + "grad_norm": 2.571191015299499, + "language_loss": 0.74001634, + "learning_rate": 2.025705356591475e-06, + "loss": 0.76813948, + "num_input_tokens_seen": 91669635, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.60546875, + "step": 4250, + "time_per_iteration": 3.08193302154541 + }, + { + "auxiliary_loss_clip": 0.01529588, + "auxiliary_loss_mlp": 0.01195145, + "balance_loss_clip": 1.17253184, + "balance_loss_mlp": 0.98991394, + "epoch": 0.5111525281067757, + "flos": 66463808741280.0, + "grad_norm": 0.76392702855972, + "language_loss": 0.57915527, + "learning_rate": 2.024926447356462e-06, + "loss": 0.60640258, + "num_input_tokens_seen": 91731920, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.0546875, + "step": 4251, + "time_per_iteration": 3.3482916355133057 + }, + { + "auxiliary_loss_clip": 0.0151598, + "auxiliary_loss_mlp": 0.01296948, + "balance_loss_clip": 1.15160227, + "balance_loss_mlp": 1.03468788, + "epoch": 0.5112727709974147, + "flos": 14868517992480.0, + "grad_norm": 2.3004182954276615, + "language_loss": 0.78748703, + "learning_rate": 2.024147534340127e-06, + "loss": 0.81561631, + "num_input_tokens_seen": 91749780, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 2.62304688, + "step": 4252, + "time_per_iteration": 3.121974229812622 + }, + { + "auxiliary_loss_clip": 0.015186, + "auxiliary_loss_mlp": 0.01286925, + "balance_loss_clip": 1.15696979, + "balance_loss_mlp": 1.03019595, + "epoch": 0.5113930138880539, + "flos": 21179675131200.0, + "grad_norm": 2.1811030378623433, + "language_loss": 0.79798377, + "learning_rate": 2.02336861766063e-06, + "loss": 0.82603908, + "num_input_tokens_seen": 91768840, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.56640625, + "step": 4253, + "time_per_iteration": 3.1698567867279053 + }, + { + "auxiliary_loss_clip": 0.01526721, + "auxiliary_loss_mlp": 0.01286328, + "balance_loss_clip": 1.16456008, + "balance_loss_mlp": 1.02807331, + "epoch": 0.511513256778693, + "flos": 20411015389920.0, + "grad_norm": 1.7034181715197851, + "language_loss": 0.7876631, + "learning_rate": 2.0225896974361327e-06, + "loss": 0.81579363, + "num_input_tokens_seen": 91788945, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.58203125, + "step": 4254, + "time_per_iteration": 3.0359079837799072 + }, + { + "auxiliary_loss_clip": 0.01527907, + "auxiliary_loss_mlp": 0.01224457, + "balance_loss_clip": 1.17042077, + "balance_loss_mlp": 1.01617432, + "epoch": 0.511633499669332, + "flos": 69886480145760.0, + "grad_norm": 0.865533288250927, + "language_loss": 0.59903133, + "learning_rate": 2.0218107737847962e-06, + "loss": 0.62655497, + "num_input_tokens_seen": 91850990, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.0859375, + "step": 4255, + "time_per_iteration": 3.4296910762786865 + }, + { + "auxiliary_loss_clip": 0.0152105, + "auxiliary_loss_mlp": 0.01291138, + "balance_loss_clip": 1.15779376, + "balance_loss_mlp": 1.03421783, + "epoch": 0.5117537425599712, + "flos": 24100172596800.0, + "grad_norm": 3.3242830759644306, + "language_loss": 0.74861252, + "learning_rate": 2.0210318468247826e-06, + "loss": 0.77673435, + "num_input_tokens_seen": 91869960, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.56835938, + "step": 4256, + "time_per_iteration": 3.2038235664367676 + }, + { + "auxiliary_loss_clip": 0.01511136, + "auxiliary_loss_mlp": 0.01278336, + "balance_loss_clip": 1.14676404, + "balance_loss_mlp": 1.02332377, + "epoch": 0.5118739854506102, + "flos": 20961661881600.0, + "grad_norm": 2.274242553330964, + "language_loss": 0.82029533, + "learning_rate": 2.020252916674255e-06, + "loss": 0.84819007, + "num_input_tokens_seen": 91889075, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.54882812, + "step": 4257, + "time_per_iteration": 3.109794855117798 + }, + { + "auxiliary_loss_clip": 0.01518593, + "auxiliary_loss_mlp": 0.01277245, + "balance_loss_clip": 1.15747905, + "balance_loss_mlp": 1.022614, + "epoch": 0.5119942283412493, + "flos": 17459264756160.0, + "grad_norm": 2.087227881768205, + "language_loss": 0.81071061, + "learning_rate": 2.019473983451375e-06, + "loss": 0.83866894, + "num_input_tokens_seen": 91907495, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 2.54492188, + "step": 4258, + "time_per_iteration": 3.2372641563415527 + }, + { + "auxiliary_loss_clip": 0.01518111, + "auxiliary_loss_mlp": 0.01281314, + "balance_loss_clip": 1.15735185, + "balance_loss_mlp": 1.02401233, + "epoch": 0.5121144712318885, + "flos": 21068734170240.0, + "grad_norm": 1.9983744622027386, + "language_loss": 0.71553409, + "learning_rate": 2.0186950472743076e-06, + "loss": 0.74352837, + "num_input_tokens_seen": 91927400, + "router_z_loss_clip": 3.609375, + "router_z_loss_mlp": 2.57226562, + "step": 4259, + "time_per_iteration": 3.028208017349243 + }, + { + "auxiliary_loss_clip": 0.01517887, + "auxiliary_loss_mlp": 0.01275219, + "balance_loss_clip": 1.15649927, + "balance_loss_mlp": 1.01772654, + "epoch": 0.5122347141225275, + "flos": 19862606659680.0, + "grad_norm": 2.2296861172809916, + "language_loss": 0.73672134, + "learning_rate": 2.0179161082612162e-06, + "loss": 0.76465243, + "num_input_tokens_seen": 91946790, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 2.57421875, + "step": 4260, + "time_per_iteration": 3.005953073501587 + }, + { + "auxiliary_loss_clip": 0.01514284, + "auxiliary_loss_mlp": 0.01281621, + "balance_loss_clip": 1.15092683, + "balance_loss_mlp": 1.02584577, + "epoch": 0.5123549570131666, + "flos": 22530689670240.0, + "grad_norm": 2.0180182353837055, + "language_loss": 0.72621387, + "learning_rate": 2.017137166530266e-06, + "loss": 0.75417292, + "num_input_tokens_seen": 91966325, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.55664062, + "step": 4261, + "time_per_iteration": 3.055804967880249 + }, + { + "auxiliary_loss_clip": 0.01516828, + "auxiliary_loss_mlp": 0.01273948, + "balance_loss_clip": 1.15534818, + "balance_loss_mlp": 1.01836276, + "epoch": 0.5124751999038056, + "flos": 20335120413120.0, + "grad_norm": 3.918546201168176, + "language_loss": 0.80176961, + "learning_rate": 2.0163582221996213e-06, + "loss": 0.82967734, + "num_input_tokens_seen": 91984700, + "router_z_loss_clip": 3.609375, + "router_z_loss_mlp": 2.5546875, + "step": 4262, + "time_per_iteration": 2.9813225269317627 + }, + { + "auxiliary_loss_clip": 0.01518435, + "auxiliary_loss_mlp": 0.01284185, + "balance_loss_clip": 1.15644455, + "balance_loss_mlp": 1.02631152, + "epoch": 0.5125954427944448, + "flos": 39789122997600.0, + "grad_norm": 2.3182089239399737, + "language_loss": 0.68579161, + "learning_rate": 2.015579275387446e-06, + "loss": 0.71381783, + "num_input_tokens_seen": 92010020, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 2.578125, + "step": 4263, + "time_per_iteration": 3.1620826721191406 + }, + { + "auxiliary_loss_clip": 0.01519491, + "auxiliary_loss_mlp": 0.01282933, + "balance_loss_clip": 1.15670121, + "balance_loss_mlp": 1.02811122, + "epoch": 0.5127156856850839, + "flos": 29207970980640.0, + "grad_norm": 2.486501140200056, + "language_loss": 0.68739128, + "learning_rate": 2.0148003262119085e-06, + "loss": 0.71541554, + "num_input_tokens_seen": 92030990, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.546875, + "step": 4264, + "time_per_iteration": 3.0358152389526367 + }, + { + "auxiliary_loss_clip": 0.01519836, + "auxiliary_loss_mlp": 0.01283235, + "balance_loss_clip": 1.15837264, + "balance_loss_mlp": 1.02478909, + "epoch": 0.5128359285757229, + "flos": 13555431977760.0, + "grad_norm": 1.7825693874513484, + "language_loss": 0.76787758, + "learning_rate": 2.0140213747911728e-06, + "loss": 0.79590833, + "num_input_tokens_seen": 92049525, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 2.58398438, + "step": 4265, + "time_per_iteration": 2.9534242153167725 + }, + { + "auxiliary_loss_clip": 0.01521856, + "auxiliary_loss_mlp": 0.01277577, + "balance_loss_clip": 1.15956974, + "balance_loss_mlp": 1.01875007, + "epoch": 0.5129561714663621, + "flos": 25194448870560.0, + "grad_norm": 3.979955344188917, + "language_loss": 0.80925477, + "learning_rate": 2.013242421243406e-06, + "loss": 0.83724904, + "num_input_tokens_seen": 92068430, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.58789062, + "step": 4266, + "time_per_iteration": 3.0145790576934814 + }, + { + "auxiliary_loss_clip": 0.01521423, + "auxiliary_loss_mlp": 0.01271826, + "balance_loss_clip": 1.15917182, + "balance_loss_mlp": 1.01643181, + "epoch": 0.5130764143570011, + "flos": 18152181233280.0, + "grad_norm": 1.8907167199050772, + "language_loss": 0.7873559, + "learning_rate": 2.012463465686774e-06, + "loss": 0.81528836, + "num_input_tokens_seen": 92088180, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.55273438, + "step": 4267, + "time_per_iteration": 2.9892446994781494 + }, + { + "auxiliary_loss_clip": 0.01513508, + "auxiliary_loss_mlp": 0.01204285, + "balance_loss_clip": 1.15756106, + "balance_loss_mlp": 0.99752808, + "epoch": 0.5131966572476402, + "flos": 59802040048320.0, + "grad_norm": 0.7850133663042071, + "language_loss": 0.54697531, + "learning_rate": 2.0116845082394446e-06, + "loss": 0.57415324, + "num_input_tokens_seen": 92153015, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.0703125, + "step": 4268, + "time_per_iteration": 5.391507148742676 + }, + { + "auxiliary_loss_clip": 0.01519783, + "auxiliary_loss_mlp": 0.01284045, + "balance_loss_clip": 1.15854859, + "balance_loss_mlp": 1.02559972, + "epoch": 0.5133169001382794, + "flos": 18517205488320.0, + "grad_norm": 2.2037238959333263, + "language_loss": 0.78803146, + "learning_rate": 2.0109055490195836e-06, + "loss": 0.81606972, + "num_input_tokens_seen": 92171470, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 2.58398438, + "step": 4269, + "time_per_iteration": 3.061837911605835 + }, + { + "auxiliary_loss_clip": 0.01513862, + "auxiliary_loss_mlp": 0.01278327, + "balance_loss_clip": 1.15148509, + "balance_loss_mlp": 1.0215981, + "epoch": 0.5134371430289184, + "flos": 15525364425120.0, + "grad_norm": 2.596331044428542, + "language_loss": 0.64060581, + "learning_rate": 2.0101265881453605e-06, + "loss": 0.66852772, + "num_input_tokens_seen": 92189945, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 2.56640625, + "step": 4270, + "time_per_iteration": 3.027754306793213 + }, + { + "auxiliary_loss_clip": 0.01516854, + "auxiliary_loss_mlp": 0.01282881, + "balance_loss_clip": 1.15428793, + "balance_loss_mlp": 1.02901328, + "epoch": 0.5135573859195575, + "flos": 21435920330400.0, + "grad_norm": 2.178504824911716, + "language_loss": 0.78156149, + "learning_rate": 2.009347625734941e-06, + "loss": 0.80955887, + "num_input_tokens_seen": 92209855, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.53710938, + "step": 4271, + "time_per_iteration": 3.004389762878418 + }, + { + "auxiliary_loss_clip": 0.01518604, + "auxiliary_loss_mlp": 0.0128348, + "balance_loss_clip": 1.15726912, + "balance_loss_mlp": 1.02770424, + "epoch": 0.5136776288101966, + "flos": 17714599679520.0, + "grad_norm": 9.255512590092788, + "language_loss": 0.75295752, + "learning_rate": 2.0085686619064954e-06, + "loss": 0.78097832, + "num_input_tokens_seen": 92226295, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 2.55664062, + "step": 4272, + "time_per_iteration": 3.8100454807281494 + }, + { + "auxiliary_loss_clip": 0.01516042, + "auxiliary_loss_mlp": 0.01298878, + "balance_loss_clip": 1.15401638, + "balance_loss_mlp": 1.04062319, + "epoch": 0.5137978717008357, + "flos": 16583570654400.0, + "grad_norm": 2.425716630031114, + "language_loss": 0.83141446, + "learning_rate": 2.00778969677819e-06, + "loss": 0.85956365, + "num_input_tokens_seen": 92243330, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.58203125, + "step": 4273, + "time_per_iteration": 3.0376386642456055 + }, + { + "auxiliary_loss_clip": 0.01520471, + "auxiliary_loss_mlp": 0.01277203, + "balance_loss_clip": 1.15844512, + "balance_loss_mlp": 1.0225724, + "epoch": 0.5139181145914747, + "flos": 20670370698240.0, + "grad_norm": 2.2511844560480974, + "language_loss": 0.63952017, + "learning_rate": 2.0070107304681934e-06, + "loss": 0.66749692, + "num_input_tokens_seen": 92262285, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.54492188, + "step": 4274, + "time_per_iteration": 3.049621820449829 + }, + { + "auxiliary_loss_clip": 0.01520316, + "auxiliary_loss_mlp": 0.01282587, + "balance_loss_clip": 1.15816367, + "balance_loss_mlp": 1.02490425, + "epoch": 0.5140383574821139, + "flos": 32929784697600.0, + "grad_norm": 1.957178754877165, + "language_loss": 0.78113192, + "learning_rate": 2.006231763094675e-06, + "loss": 0.80916095, + "num_input_tokens_seen": 92283305, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.57617188, + "step": 4275, + "time_per_iteration": 3.9743144512176514 + }, + { + "auxiliary_loss_clip": 0.01520538, + "auxiliary_loss_mlp": 0.01280225, + "balance_loss_clip": 1.15736115, + "balance_loss_mlp": 1.02769172, + "epoch": 0.514158600372753, + "flos": 19539152032320.0, + "grad_norm": 2.062475592047404, + "language_loss": 0.87678653, + "learning_rate": 2.0054527947758027e-06, + "loss": 0.90479422, + "num_input_tokens_seen": 92302105, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.5234375, + "step": 4276, + "time_per_iteration": 3.0801842212677 + }, + { + "auxiliary_loss_clip": 0.01509739, + "auxiliary_loss_mlp": 0.01212807, + "balance_loss_clip": 1.15380681, + "balance_loss_mlp": 1.00605011, + "epoch": 0.514278843263392, + "flos": 62530732987200.0, + "grad_norm": 0.727518288350531, + "language_loss": 0.55897987, + "learning_rate": 2.004673825629746e-06, + "loss": 0.58620536, + "num_input_tokens_seen": 92362885, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.0703125, + "step": 4277, + "time_per_iteration": 3.482882261276245 + }, + { + "auxiliary_loss_clip": 0.0151409, + "auxiliary_loss_mlp": 0.01283458, + "balance_loss_clip": 1.15164995, + "balance_loss_mlp": 1.02768254, + "epoch": 0.5143990861540312, + "flos": 25884558663840.0, + "grad_norm": 1.7484638568159654, + "language_loss": 0.72448003, + "learning_rate": 2.0038948557746744e-06, + "loss": 0.75245559, + "num_input_tokens_seen": 92384740, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 2.55664062, + "step": 4278, + "time_per_iteration": 3.178283214569092 + }, + { + "auxiliary_loss_clip": 0.01512808, + "auxiliary_loss_mlp": 0.01283949, + "balance_loss_clip": 1.14986646, + "balance_loss_mlp": 1.02607536, + "epoch": 0.5145193290446702, + "flos": 23332878269280.0, + "grad_norm": 1.8309270512954734, + "language_loss": 0.75011462, + "learning_rate": 2.0031158853287558e-06, + "loss": 0.77808219, + "num_input_tokens_seen": 92405175, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.578125, + "step": 4279, + "time_per_iteration": 3.2414064407348633 + }, + { + "auxiliary_loss_clip": 0.01517435, + "auxiliary_loss_mlp": 0.01280647, + "balance_loss_clip": 1.15558827, + "balance_loss_mlp": 1.02334595, + "epoch": 0.5146395719353093, + "flos": 22858809461280.0, + "grad_norm": 2.371330090973028, + "language_loss": 0.70551097, + "learning_rate": 2.0023369144101593e-06, + "loss": 0.7334919, + "num_input_tokens_seen": 92423345, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.57226562, + "step": 4280, + "time_per_iteration": 2.965118885040283 + }, + { + "auxiliary_loss_clip": 0.01510969, + "auxiliary_loss_mlp": 0.01272984, + "balance_loss_clip": 1.14658034, + "balance_loss_mlp": 1.01701784, + "epoch": 0.5147598148259485, + "flos": 26393749312320.0, + "grad_norm": 1.655849637931615, + "language_loss": 0.76588321, + "learning_rate": 2.0015579431370555e-06, + "loss": 0.79372269, + "num_input_tokens_seen": 92445025, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 2.55859375, + "step": 4281, + "time_per_iteration": 3.0533664226531982 + }, + { + "auxiliary_loss_clip": 0.01516675, + "auxiliary_loss_mlp": 0.01273971, + "balance_loss_clip": 1.15356696, + "balance_loss_mlp": 1.02048457, + "epoch": 0.5148800577165875, + "flos": 29967983101440.0, + "grad_norm": 4.089290524523151, + "language_loss": 0.69852382, + "learning_rate": 2.000778971627612e-06, + "loss": 0.7264303, + "num_input_tokens_seen": 92464490, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.53320312, + "step": 4282, + "time_per_iteration": 2.984006643295288 + }, + { + "auxiliary_loss_clip": 0.01516486, + "auxiliary_loss_mlp": 0.01277869, + "balance_loss_clip": 1.15343428, + "balance_loss_mlp": 1.02133048, + "epoch": 0.5150003006072266, + "flos": 17933333564160.0, + "grad_norm": 14.343802455991996, + "language_loss": 0.90472341, + "learning_rate": 2e-06, + "loss": 0.93266696, + "num_input_tokens_seen": 92482085, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.56445312, + "step": 4283, + "time_per_iteration": 2.964122772216797 + }, + { + "auxiliary_loss_clip": 0.01513882, + "auxiliary_loss_mlp": 0.01288809, + "balance_loss_clip": 1.14990306, + "balance_loss_mlp": 1.03284228, + "epoch": 0.5151205434978657, + "flos": 18480718234080.0, + "grad_norm": 1.9427018620756362, + "language_loss": 0.85664397, + "learning_rate": 1.9992210283723878e-06, + "loss": 0.88467085, + "num_input_tokens_seen": 92499325, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.55859375, + "step": 4284, + "time_per_iteration": 2.974851608276367 + }, + { + "auxiliary_loss_clip": 0.0151057, + "auxiliary_loss_mlp": 0.01279728, + "balance_loss_clip": 1.14610577, + "balance_loss_mlp": 1.02681386, + "epoch": 0.5152407863885048, + "flos": 25343811421920.0, + "grad_norm": 1.7060735175972686, + "language_loss": 0.7937423, + "learning_rate": 1.9984420568629448e-06, + "loss": 0.82164526, + "num_input_tokens_seen": 92522090, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 2.52734375, + "step": 4285, + "time_per_iteration": 2.966946840286255 + }, + { + "auxiliary_loss_clip": 0.0150681, + "auxiliary_loss_mlp": 0.01280584, + "balance_loss_clip": 1.14347923, + "balance_loss_mlp": 1.0249989, + "epoch": 0.5153610292791438, + "flos": 18331545323520.0, + "grad_norm": 2.610656566036796, + "language_loss": 0.78116906, + "learning_rate": 1.9976630855898405e-06, + "loss": 0.80904299, + "num_input_tokens_seen": 92539845, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.5546875, + "step": 4286, + "time_per_iteration": 2.9892444610595703 + }, + { + "auxiliary_loss_clip": 0.01506801, + "auxiliary_loss_mlp": 0.01283332, + "balance_loss_clip": 1.14304137, + "balance_loss_mlp": 1.02851033, + "epoch": 0.515481272169783, + "flos": 30412619292960.0, + "grad_norm": 2.939895757557855, + "language_loss": 0.74782652, + "learning_rate": 1.9968841146712445e-06, + "loss": 0.77572787, + "num_input_tokens_seen": 92559460, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 2.546875, + "step": 4287, + "time_per_iteration": 3.0687081813812256 + }, + { + "auxiliary_loss_clip": 0.01514551, + "auxiliary_loss_mlp": 0.0127588, + "balance_loss_clip": 1.15160871, + "balance_loss_mlp": 1.01953197, + "epoch": 0.5156015150604221, + "flos": 23039425180800.0, + "grad_norm": 1.7199661299261098, + "language_loss": 0.71359289, + "learning_rate": 1.996105144225326e-06, + "loss": 0.74149716, + "num_input_tokens_seen": 92579695, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.5625, + "step": 4288, + "time_per_iteration": 3.059600830078125 + }, + { + "auxiliary_loss_clip": 0.01508135, + "auxiliary_loss_mlp": 0.01292316, + "balance_loss_clip": 1.14434159, + "balance_loss_mlp": 1.03520548, + "epoch": 0.5157217579510611, + "flos": 17860510768320.0, + "grad_norm": 2.436719037977629, + "language_loss": 0.7905854, + "learning_rate": 1.995326174370254e-06, + "loss": 0.81858993, + "num_input_tokens_seen": 92598795, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.5703125, + "step": 4289, + "time_per_iteration": 3.1498255729675293 + }, + { + "auxiliary_loss_clip": 0.01509709, + "auxiliary_loss_mlp": 0.01266026, + "balance_loss_clip": 1.14774179, + "balance_loss_mlp": 1.01311183, + "epoch": 0.5158420008417003, + "flos": 19173482998560.0, + "grad_norm": 1.8615827577603563, + "language_loss": 0.72986722, + "learning_rate": 1.994547205224197e-06, + "loss": 0.75762463, + "num_input_tokens_seen": 92617700, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.52734375, + "step": 4290, + "time_per_iteration": 3.0988526344299316 + }, + { + "auxiliary_loss_clip": 0.0151094, + "auxiliary_loss_mlp": 0.01289222, + "balance_loss_clip": 1.14819419, + "balance_loss_mlp": 1.03306484, + "epoch": 0.5159622437323393, + "flos": 22421758901760.0, + "grad_norm": 2.2017862452956614, + "language_loss": 0.67851532, + "learning_rate": 1.993768236905325e-06, + "loss": 0.70651692, + "num_input_tokens_seen": 92638370, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.56054688, + "step": 4291, + "time_per_iteration": 3.238949775695801 + }, + { + "auxiliary_loss_clip": 0.01509241, + "auxiliary_loss_mlp": 0.01272314, + "balance_loss_clip": 1.14637804, + "balance_loss_mlp": 1.0171113, + "epoch": 0.5160824866229784, + "flos": 24605039435040.0, + "grad_norm": 2.607788299601798, + "language_loss": 0.66472304, + "learning_rate": 1.992989269531807e-06, + "loss": 0.69253862, + "num_input_tokens_seen": 92657180, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.55078125, + "step": 4292, + "time_per_iteration": 3.2566938400268555 + }, + { + "auxiliary_loss_clip": 0.01518337, + "auxiliary_loss_mlp": 0.01284265, + "balance_loss_clip": 1.15539193, + "balance_loss_mlp": 1.02868009, + "epoch": 0.5162027295136175, + "flos": 18004601305440.0, + "grad_norm": 3.1322134239745596, + "language_loss": 0.68891621, + "learning_rate": 1.99221030322181e-06, + "loss": 0.71694219, + "num_input_tokens_seen": 92673985, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.5546875, + "step": 4293, + "time_per_iteration": 3.0082013607025146 + }, + { + "auxiliary_loss_clip": 0.01510329, + "auxiliary_loss_mlp": 0.01279238, + "balance_loss_clip": 1.14615417, + "balance_loss_mlp": 1.02575111, + "epoch": 0.5163229724042566, + "flos": 27347083086240.0, + "grad_norm": 1.8080291109726478, + "language_loss": 0.80525786, + "learning_rate": 1.991431338093505e-06, + "loss": 0.83315349, + "num_input_tokens_seen": 92696340, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 2.53320312, + "step": 4294, + "time_per_iteration": 3.096513509750366 + }, + { + "auxiliary_loss_clip": 0.01518899, + "auxiliary_loss_mlp": 0.01269332, + "balance_loss_clip": 1.15604913, + "balance_loss_mlp": 1.01641726, + "epoch": 0.5164432152948957, + "flos": 21764950397280.0, + "grad_norm": 1.820655050733046, + "language_loss": 0.79473412, + "learning_rate": 1.9906523742650587e-06, + "loss": 0.82261646, + "num_input_tokens_seen": 92715200, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.52734375, + "step": 4295, + "time_per_iteration": 3.9193623065948486 + }, + { + "auxiliary_loss_clip": 0.01512267, + "auxiliary_loss_mlp": 0.01279509, + "balance_loss_clip": 1.149194, + "balance_loss_mlp": 1.02278018, + "epoch": 0.5165634581855347, + "flos": 25552342631520.0, + "grad_norm": 2.1969282925773514, + "language_loss": 0.77341741, + "learning_rate": 1.9898734118546397e-06, + "loss": 0.8013351, + "num_input_tokens_seen": 92735150, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.56640625, + "step": 4296, + "time_per_iteration": 3.0189430713653564 + }, + { + "auxiliary_loss_clip": 0.0151743, + "auxiliary_loss_mlp": 0.01285582, + "balance_loss_clip": 1.15429175, + "balance_loss_mlp": 1.03018808, + "epoch": 0.5166837010761739, + "flos": 19903228083360.0, + "grad_norm": 2.377217602320162, + "language_loss": 0.80588698, + "learning_rate": 1.989094450980416e-06, + "loss": 0.83391702, + "num_input_tokens_seen": 92755250, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.55273438, + "step": 4297, + "time_per_iteration": 3.1813976764678955 + }, + { + "auxiliary_loss_clip": 0.01506324, + "auxiliary_loss_mlp": 0.01278075, + "balance_loss_clip": 1.14249885, + "balance_loss_mlp": 1.02172709, + "epoch": 0.516803943966813, + "flos": 26648211888000.0, + "grad_norm": 2.4128582911877268, + "language_loss": 0.77001047, + "learning_rate": 1.9883154917605556e-06, + "loss": 0.79785454, + "num_input_tokens_seen": 92774460, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 2.5625, + "step": 4298, + "time_per_iteration": 2.9925057888031006 + }, + { + "auxiliary_loss_clip": 0.01508049, + "auxiliary_loss_mlp": 0.01280603, + "balance_loss_clip": 1.14239407, + "balance_loss_mlp": 1.02635312, + "epoch": 0.516924186857452, + "flos": 19684911408480.0, + "grad_norm": 2.128165808832664, + "language_loss": 0.8359828, + "learning_rate": 1.9875365343132262e-06, + "loss": 0.86386931, + "num_input_tokens_seen": 92791580, + "router_z_loss_clip": 3.65820312, + "router_z_loss_mlp": 2.54101562, + "step": 4299, + "time_per_iteration": 3.8071069717407227 + }, + { + "auxiliary_loss_clip": 0.01515992, + "auxiliary_loss_mlp": 0.01280526, + "balance_loss_clip": 1.15328991, + "balance_loss_mlp": 1.02475095, + "epoch": 0.5170444297480912, + "flos": 15958167030720.0, + "grad_norm": 2.2484738048831048, + "language_loss": 0.85075641, + "learning_rate": 1.9867575787565946e-06, + "loss": 0.87872154, + "num_input_tokens_seen": 92806240, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.55664062, + "step": 4300, + "time_per_iteration": 3.0001676082611084 + }, + { + "auxiliary_loss_clip": 0.01508876, + "auxiliary_loss_mlp": 0.01290147, + "balance_loss_clip": 1.14613318, + "balance_loss_mlp": 1.03112912, + "epoch": 0.5171646726387302, + "flos": 14175942868800.0, + "grad_norm": 2.3579315067627484, + "language_loss": 0.86156356, + "learning_rate": 1.9859786252088275e-06, + "loss": 0.88955373, + "num_input_tokens_seen": 92823420, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.58984375, + "step": 4301, + "time_per_iteration": 2.9762134552001953 + }, + { + "auxiliary_loss_clip": 0.01508133, + "auxiliary_loss_mlp": 0.01293535, + "balance_loss_clip": 1.14508176, + "balance_loss_mlp": 1.03661501, + "epoch": 0.5172849155293693, + "flos": 23580362063520.0, + "grad_norm": 2.787830089015716, + "language_loss": 0.66123068, + "learning_rate": 1.9851996737880914e-06, + "loss": 0.68924737, + "num_input_tokens_seen": 92838605, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.56835938, + "step": 4302, + "time_per_iteration": 4.0129554271698 + }, + { + "auxiliary_loss_clip": 0.01513433, + "auxiliary_loss_mlp": 0.01291437, + "balance_loss_clip": 1.15043771, + "balance_loss_mlp": 1.03337252, + "epoch": 0.5174051584200084, + "flos": 14285328775200.0, + "grad_norm": 2.754748648016157, + "language_loss": 0.7471385, + "learning_rate": 1.9844207246125537e-06, + "loss": 0.77518725, + "num_input_tokens_seen": 92855185, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.58007812, + "step": 4303, + "time_per_iteration": 3.1332528591156006 + }, + { + "auxiliary_loss_clip": 0.01513241, + "auxiliary_loss_mlp": 0.01283511, + "balance_loss_clip": 1.15025985, + "balance_loss_mlp": 1.02964282, + "epoch": 0.5175254013106475, + "flos": 37892468484000.0, + "grad_norm": 7.469058816160094, + "language_loss": 0.68516803, + "learning_rate": 1.983641777800379e-06, + "loss": 0.71313554, + "num_input_tokens_seen": 92877830, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.53710938, + "step": 4304, + "time_per_iteration": 3.0704357624053955 + }, + { + "auxiliary_loss_clip": 0.01483743, + "auxiliary_loss_mlp": 0.01205391, + "balance_loss_clip": 1.12729478, + "balance_loss_mlp": 0.9978714, + "epoch": 0.5176456442012866, + "flos": 68557274298720.0, + "grad_norm": 0.7814410689961143, + "language_loss": 0.58727843, + "learning_rate": 1.9828628334697343e-06, + "loss": 0.61416978, + "num_input_tokens_seen": 92945040, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.078125, + "step": 4305, + "time_per_iteration": 3.6303884983062744 + }, + { + "auxiliary_loss_clip": 0.01483176, + "auxiliary_loss_mlp": 0.01211128, + "balance_loss_clip": 1.12660742, + "balance_loss_mlp": 1.00131989, + "epoch": 0.5177658870919257, + "flos": 64090620089280.0, + "grad_norm": 0.7642954947539997, + "language_loss": 0.54616976, + "learning_rate": 1.982083891738784e-06, + "loss": 0.57311279, + "num_input_tokens_seen": 93005910, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.1015625, + "step": 4306, + "time_per_iteration": 3.476562261581421 + }, + { + "auxiliary_loss_clip": 0.01509668, + "auxiliary_loss_mlp": 0.01276111, + "balance_loss_clip": 1.14605904, + "balance_loss_mlp": 1.02510357, + "epoch": 0.5178861299825648, + "flos": 26653673543040.0, + "grad_norm": 2.0724476517450867, + "language_loss": 0.82646358, + "learning_rate": 1.9813049527256923e-06, + "loss": 0.85432136, + "num_input_tokens_seen": 93026305, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 2.5078125, + "step": 4307, + "time_per_iteration": 3.0711829662323 + }, + { + "auxiliary_loss_clip": 0.01501493, + "auxiliary_loss_mlp": 0.01279558, + "balance_loss_clip": 1.13679242, + "balance_loss_mlp": 1.02340126, + "epoch": 0.5180063728732038, + "flos": 17933864558400.0, + "grad_norm": 2.8262811440927593, + "language_loss": 0.81747806, + "learning_rate": 1.9805260165486252e-06, + "loss": 0.84528857, + "num_input_tokens_seen": 93045675, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 2.56054688, + "step": 4308, + "time_per_iteration": 3.0561578273773193 + }, + { + "auxiliary_loss_clip": 0.01510027, + "auxiliary_loss_mlp": 0.01274089, + "balance_loss_clip": 1.14743304, + "balance_loss_mlp": 1.02022076, + "epoch": 0.518126615763843, + "flos": 19502968203360.0, + "grad_norm": 2.5472749373544827, + "language_loss": 0.86549395, + "learning_rate": 1.9797470833257457e-06, + "loss": 0.8933351, + "num_input_tokens_seen": 93065375, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.53710938, + "step": 4309, + "time_per_iteration": 3.055640697479248 + }, + { + "auxiliary_loss_clip": 0.01510499, + "auxiliary_loss_mlp": 0.01283428, + "balance_loss_clip": 1.14859009, + "balance_loss_mlp": 1.02746165, + "epoch": 0.5182468586544821, + "flos": 20706175245600.0, + "grad_norm": 2.623366282010283, + "language_loss": 0.77460396, + "learning_rate": 1.9789681531752177e-06, + "loss": 0.80254322, + "num_input_tokens_seen": 93085595, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 2.55859375, + "step": 4310, + "time_per_iteration": 3.0334343910217285 + }, + { + "auxiliary_loss_clip": 0.01509334, + "auxiliary_loss_mlp": 0.01272355, + "balance_loss_clip": 1.14532316, + "balance_loss_mlp": 1.01982188, + "epoch": 0.5183671015451211, + "flos": 23114334025440.0, + "grad_norm": 1.7851520110403367, + "language_loss": 0.73002648, + "learning_rate": 1.978189226215204e-06, + "loss": 0.75784338, + "num_input_tokens_seen": 93106140, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 2.5234375, + "step": 4311, + "time_per_iteration": 3.044079542160034 + }, + { + "auxiliary_loss_clip": 0.01512738, + "auxiliary_loss_mlp": 0.01283155, + "balance_loss_clip": 1.1483779, + "balance_loss_mlp": 1.02757001, + "epoch": 0.5184873444357603, + "flos": 17598690129600.0, + "grad_norm": 2.148058693196406, + "language_loss": 0.77061343, + "learning_rate": 1.9774103025638675e-06, + "loss": 0.7985723, + "num_input_tokens_seen": 93124265, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 2.5546875, + "step": 4312, + "time_per_iteration": 3.0869812965393066 + }, + { + "auxiliary_loss_clip": 0.0151203, + "auxiliary_loss_mlp": 0.01287072, + "balance_loss_clip": 1.14829326, + "balance_loss_mlp": 1.03091502, + "epoch": 0.5186075873263993, + "flos": 24939151875360.0, + "grad_norm": 1.646133896814042, + "language_loss": 0.76563871, + "learning_rate": 1.9766313823393696e-06, + "loss": 0.79362977, + "num_input_tokens_seen": 93145130, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.56054688, + "step": 4313, + "time_per_iteration": 3.0610601902008057 + }, + { + "auxiliary_loss_clip": 0.01506564, + "auxiliary_loss_mlp": 0.01273327, + "balance_loss_clip": 1.14217365, + "balance_loss_mlp": 1.01888704, + "epoch": 0.5187278302170384, + "flos": 15191631266400.0, + "grad_norm": 2.431881985980803, + "language_loss": 0.68918943, + "learning_rate": 1.975852465659873e-06, + "loss": 0.71698833, + "num_input_tokens_seen": 93161110, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 2.54296875, + "step": 4314, + "time_per_iteration": 3.1506645679473877 + }, + { + "auxiliary_loss_clip": 0.01512351, + "auxiliary_loss_mlp": 0.01285056, + "balance_loss_clip": 1.14970779, + "balance_loss_mlp": 1.0279454, + "epoch": 0.5188480731076776, + "flos": 25011747102240.0, + "grad_norm": 6.200615706448931, + "language_loss": 0.7038871, + "learning_rate": 1.9750735526435377e-06, + "loss": 0.73186117, + "num_input_tokens_seen": 93178055, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.5703125, + "step": 4315, + "time_per_iteration": 3.1050801277160645 + }, + { + "auxiliary_loss_clip": 0.01507272, + "auxiliary_loss_mlp": 0.01288916, + "balance_loss_clip": 1.1440177, + "balance_loss_mlp": 1.03256834, + "epoch": 0.5189683159983166, + "flos": 24792444295200.0, + "grad_norm": 6.102802455623138, + "language_loss": 0.79301047, + "learning_rate": 1.974294643408525e-06, + "loss": 0.82097232, + "num_input_tokens_seen": 93195850, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.5625, + "step": 4316, + "time_per_iteration": 3.155944585800171 + }, + { + "auxiliary_loss_clip": 0.01510447, + "auxiliary_loss_mlp": 0.01289801, + "balance_loss_clip": 1.14664137, + "balance_loss_mlp": 1.03173685, + "epoch": 0.5190885588889557, + "flos": 24756601819680.0, + "grad_norm": 4.108017848411578, + "language_loss": 0.67383581, + "learning_rate": 1.9735157380729947e-06, + "loss": 0.70183831, + "num_input_tokens_seen": 93216260, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 2.58007812, + "step": 4317, + "time_per_iteration": 3.080538272857666 + }, + { + "auxiliary_loss_clip": 0.01508944, + "auxiliary_loss_mlp": 0.01285822, + "balance_loss_clip": 1.14566445, + "balance_loss_mlp": 1.03195405, + "epoch": 0.5192088017795948, + "flos": 24714690838560.0, + "grad_norm": 2.34561693881621, + "language_loss": 0.84681743, + "learning_rate": 1.9727368367551053e-06, + "loss": 0.8747651, + "num_input_tokens_seen": 93234810, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.53710938, + "step": 4318, + "time_per_iteration": 2.9882700443267822 + }, + { + "auxiliary_loss_clip": 0.01506726, + "auxiliary_loss_mlp": 0.01279565, + "balance_loss_clip": 1.14329171, + "balance_loss_mlp": 1.0260787, + "epoch": 0.5193290446702339, + "flos": 27231742458720.0, + "grad_norm": 3.1212670185355984, + "language_loss": 0.68542087, + "learning_rate": 1.9719579395730164e-06, + "loss": 0.71328384, + "num_input_tokens_seen": 93254185, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.53320312, + "step": 4319, + "time_per_iteration": 3.0563876628875732 + }, + { + "auxiliary_loss_clip": 0.01515612, + "auxiliary_loss_mlp": 0.01286114, + "balance_loss_clip": 1.15167356, + "balance_loss_mlp": 1.03129196, + "epoch": 0.5194492875608729, + "flos": 11474937851040.0, + "grad_norm": 2.1891914113201136, + "language_loss": 0.93508625, + "learning_rate": 1.9711790466448854e-06, + "loss": 0.96310347, + "num_input_tokens_seen": 93268205, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 2.546875, + "step": 4320, + "time_per_iteration": 2.9640960693359375 + }, + { + "auxiliary_loss_clip": 0.01512879, + "auxiliary_loss_mlp": 0.01286695, + "balance_loss_clip": 1.14857233, + "balance_loss_mlp": 1.02843976, + "epoch": 0.5195695304515121, + "flos": 20340961349760.0, + "grad_norm": 2.6057899794947006, + "language_loss": 0.71412432, + "learning_rate": 1.9704001580888704e-06, + "loss": 0.74212003, + "num_input_tokens_seen": 93286945, + "router_z_loss_clip": 3.65039062, + "router_z_loss_mlp": 2.58203125, + "step": 4321, + "time_per_iteration": 3.031116247177124 + }, + { + "auxiliary_loss_clip": 0.015077, + "auxiliary_loss_mlp": 0.01273367, + "balance_loss_clip": 1.14588261, + "balance_loss_mlp": 1.01969004, + "epoch": 0.5196897733421512, + "flos": 20050428729600.0, + "grad_norm": 1.942347528396902, + "language_loss": 0.86598068, + "learning_rate": 1.9696212740231283e-06, + "loss": 0.89379138, + "num_input_tokens_seen": 93305595, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 2.53515625, + "step": 4322, + "time_per_iteration": 3.8994803428649902 + }, + { + "auxiliary_loss_clip": 0.01510769, + "auxiliary_loss_mlp": 0.0129, + "balance_loss_clip": 1.14612663, + "balance_loss_mlp": 1.03136396, + "epoch": 0.5198100162327902, + "flos": 23807781496800.0, + "grad_norm": 2.304303432043394, + "language_loss": 0.82494956, + "learning_rate": 1.9688423945658146e-06, + "loss": 0.85295725, + "num_input_tokens_seen": 93326460, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 2.5859375, + "step": 4323, + "time_per_iteration": 3.8554985523223877 + }, + { + "auxiliary_loss_clip": 0.01513037, + "auxiliary_loss_mlp": 0.01289627, + "balance_loss_clip": 1.14889455, + "balance_loss_mlp": 1.03213549, + "epoch": 0.5199302591234293, + "flos": 24026211956160.0, + "grad_norm": 3.0640682917569664, + "language_loss": 0.7238912, + "learning_rate": 1.9680635198350845e-06, + "loss": 0.75191784, + "num_input_tokens_seen": 93346170, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 2.57421875, + "step": 4324, + "time_per_iteration": 3.00398850440979 + }, + { + "auxiliary_loss_clip": 0.01517753, + "auxiliary_loss_mlp": 0.01281651, + "balance_loss_clip": 1.15358233, + "balance_loss_mlp": 1.02339649, + "epoch": 0.5200505020140684, + "flos": 26361775509120.0, + "grad_norm": 2.033790390780928, + "language_loss": 0.72801769, + "learning_rate": 1.967284649949093e-06, + "loss": 0.75601172, + "num_input_tokens_seen": 93365380, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 2.58203125, + "step": 4325, + "time_per_iteration": 3.1704869270324707 + }, + { + "auxiliary_loss_clip": 0.01508154, + "auxiliary_loss_mlp": 0.01279822, + "balance_loss_clip": 1.14379573, + "balance_loss_mlp": 1.02633548, + "epoch": 0.5201707449047075, + "flos": 39607445289600.0, + "grad_norm": 2.200573263509627, + "language_loss": 0.72180825, + "learning_rate": 1.966505785025994e-06, + "loss": 0.74968803, + "num_input_tokens_seen": 93387285, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 2.53320312, + "step": 4326, + "time_per_iteration": 4.041574001312256 + }, + { + "auxiliary_loss_clip": 0.01510691, + "auxiliary_loss_mlp": 0.01279874, + "balance_loss_clip": 1.14696336, + "balance_loss_mlp": 1.0252434, + "epoch": 0.5202909877953465, + "flos": 53686480980960.0, + "grad_norm": 2.1013638983396596, + "language_loss": 0.76179445, + "learning_rate": 1.965726925183941e-06, + "loss": 0.78970009, + "num_input_tokens_seen": 93410390, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 2.54492188, + "step": 4327, + "time_per_iteration": 3.249784469604492 + }, + { + "auxiliary_loss_clip": 0.01513761, + "auxiliary_loss_mlp": 0.01278867, + "balance_loss_clip": 1.14817202, + "balance_loss_mlp": 1.02557111, + "epoch": 0.5204112306859857, + "flos": 19539076176000.0, + "grad_norm": 1.9199808985910092, + "language_loss": 0.84597015, + "learning_rate": 1.964948070541087e-06, + "loss": 0.87389648, + "num_input_tokens_seen": 93429050, + "router_z_loss_clip": 3.65820312, + "router_z_loss_mlp": 2.53125, + "step": 4328, + "time_per_iteration": 3.02378511428833 + }, + { + "auxiliary_loss_clip": 0.01518007, + "auxiliary_loss_mlp": 0.01273079, + "balance_loss_clip": 1.1531775, + "balance_loss_mlp": 1.02035522, + "epoch": 0.5205314735766248, + "flos": 15306516756000.0, + "grad_norm": 2.4782466908497516, + "language_loss": 0.69290912, + "learning_rate": 1.9641692212155816e-06, + "loss": 0.72082001, + "num_input_tokens_seen": 93446815, + "router_z_loss_clip": 3.65039062, + "router_z_loss_mlp": 2.52539062, + "step": 4329, + "time_per_iteration": 3.0442473888397217 + }, + { + "auxiliary_loss_clip": 0.01522972, + "auxiliary_loss_mlp": 0.01276807, + "balance_loss_clip": 1.15968764, + "balance_loss_mlp": 1.02351153, + "epoch": 0.5206517164672638, + "flos": 59267627537760.0, + "grad_norm": 3.339980363430922, + "language_loss": 0.72664052, + "learning_rate": 1.9633903773255777e-06, + "loss": 0.75463831, + "num_input_tokens_seen": 93469130, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 2.53125, + "step": 4330, + "time_per_iteration": 4.16742467880249 + }, + { + "auxiliary_loss_clip": 0.01515645, + "auxiliary_loss_mlp": 0.0127783, + "balance_loss_clip": 1.1509701, + "balance_loss_mlp": 1.0241524, + "epoch": 0.520771959357903, + "flos": 26873621128800.0, + "grad_norm": 2.1630622681836305, + "language_loss": 0.74701524, + "learning_rate": 1.9626115389892237e-06, + "loss": 0.77494991, + "num_input_tokens_seen": 93489920, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 2.53515625, + "step": 4331, + "time_per_iteration": 3.0869271755218506 + }, + { + "auxiliary_loss_clip": 0.01518699, + "auxiliary_loss_mlp": 0.01278195, + "balance_loss_clip": 1.1532352, + "balance_loss_mlp": 1.0237546, + "epoch": 0.520892202248542, + "flos": 26909539460640.0, + "grad_norm": 3.2497302996791726, + "language_loss": 0.85431522, + "learning_rate": 1.96183270632467e-06, + "loss": 0.88228416, + "num_input_tokens_seen": 93509770, + "router_z_loss_clip": 3.66210938, + "router_z_loss_mlp": 2.54296875, + "step": 4332, + "time_per_iteration": 3.2284083366394043 + }, + { + "auxiliary_loss_clip": 0.01513341, + "auxiliary_loss_mlp": 0.01284171, + "balance_loss_clip": 1.14646637, + "balance_loss_mlp": 1.02801359, + "epoch": 0.5210124451391811, + "flos": 25851864225600.0, + "grad_norm": 1.899329429625902, + "language_loss": 0.79177451, + "learning_rate": 1.9610538794500644e-06, + "loss": 0.81974959, + "num_input_tokens_seen": 93529320, + "router_z_loss_clip": 3.67382812, + "router_z_loss_mlp": 2.56054688, + "step": 4333, + "time_per_iteration": 3.0796427726745605 + }, + { + "auxiliary_loss_clip": 0.01516145, + "auxiliary_loss_mlp": 0.01195503, + "balance_loss_clip": 1.15930867, + "balance_loss_mlp": 0.98950958, + "epoch": 0.5211326880298203, + "flos": 70561304526240.0, + "grad_norm": 0.8068153945900189, + "language_loss": 0.59377754, + "learning_rate": 1.9602750584835542e-06, + "loss": 0.62089407, + "num_input_tokens_seen": 93595255, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.0625, + "step": 4334, + "time_per_iteration": 3.573077917098999 + }, + { + "auxiliary_loss_clip": 0.01533104, + "auxiliary_loss_mlp": 0.01275872, + "balance_loss_clip": 1.17105722, + "balance_loss_mlp": 1.02429247, + "epoch": 0.5212529309204593, + "flos": 15630502377600.0, + "grad_norm": 2.6224268719592194, + "language_loss": 0.82493377, + "learning_rate": 1.959496243543286e-06, + "loss": 0.85302347, + "num_input_tokens_seen": 93613135, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 2.51367188, + "step": 4335, + "time_per_iteration": 3.0962727069854736 + }, + { + "auxiliary_loss_clip": 0.0154118, + "auxiliary_loss_mlp": 0.01291926, + "balance_loss_clip": 1.17868495, + "balance_loss_mlp": 1.03653228, + "epoch": 0.5213731738110984, + "flos": 26244652258080.0, + "grad_norm": 2.514902132824047, + "language_loss": 0.79444838, + "learning_rate": 1.9587174347474057e-06, + "loss": 0.82277948, + "num_input_tokens_seen": 93629645, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.55273438, + "step": 4336, + "time_per_iteration": 3.0369579792022705 + }, + { + "auxiliary_loss_clip": 0.01529169, + "auxiliary_loss_mlp": 0.01289806, + "balance_loss_clip": 1.16463101, + "balance_loss_mlp": 1.03212368, + "epoch": 0.5214934167017375, + "flos": 19419942732480.0, + "grad_norm": 2.4700965798652876, + "language_loss": 0.82582253, + "learning_rate": 1.9579386322140574e-06, + "loss": 0.85401231, + "num_input_tokens_seen": 93645325, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 2.57617188, + "step": 4337, + "time_per_iteration": 3.065953016281128 + }, + { + "auxiliary_loss_clip": 0.01534006, + "auxiliary_loss_mlp": 0.01290887, + "balance_loss_clip": 1.17154813, + "balance_loss_mlp": 1.03606534, + "epoch": 0.5216136595923766, + "flos": 30958410980160.0, + "grad_norm": 2.1952050587157754, + "language_loss": 0.80921113, + "learning_rate": 1.9571598360613854e-06, + "loss": 0.8374601, + "num_input_tokens_seen": 93668200, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 2.546875, + "step": 4338, + "time_per_iteration": 3.0255377292633057 + }, + { + "auxiliary_loss_clip": 0.01534613, + "auxiliary_loss_mlp": 0.01292844, + "balance_loss_clip": 1.17249799, + "balance_loss_mlp": 1.03878558, + "epoch": 0.5217339024830157, + "flos": 21947234955840.0, + "grad_norm": 3.179142270181552, + "language_loss": 0.70045531, + "learning_rate": 1.956381046407532e-06, + "loss": 0.72872984, + "num_input_tokens_seen": 93688495, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.5390625, + "step": 4339, + "time_per_iteration": 3.0079288482666016 + }, + { + "auxiliary_loss_clip": 0.01531243, + "auxiliary_loss_mlp": 0.01293997, + "balance_loss_clip": 1.16969788, + "balance_loss_mlp": 1.03917551, + "epoch": 0.5218541453736548, + "flos": 20925326340000.0, + "grad_norm": 2.207199697484757, + "language_loss": 0.8627879, + "learning_rate": 1.9556022633706394e-06, + "loss": 0.89104033, + "num_input_tokens_seen": 93707285, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 2.546875, + "step": 4340, + "time_per_iteration": 2.9916858673095703 + }, + { + "auxiliary_loss_clip": 0.01534916, + "auxiliary_loss_mlp": 0.01280532, + "balance_loss_clip": 1.17187119, + "balance_loss_mlp": 1.02532923, + "epoch": 0.5219743882642939, + "flos": 23953958082720.0, + "grad_norm": 1.7771755799943125, + "language_loss": 0.79576921, + "learning_rate": 1.954823487068848e-06, + "loss": 0.82392371, + "num_input_tokens_seen": 93727495, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.55078125, + "step": 4341, + "time_per_iteration": 3.0123484134674072 + }, + { + "auxiliary_loss_clip": 0.01534071, + "auxiliary_loss_mlp": 0.01288248, + "balance_loss_clip": 1.17237914, + "balance_loss_mlp": 1.03304482, + "epoch": 0.5220946311549329, + "flos": 28801566738720.0, + "grad_norm": 2.5603722775491455, + "language_loss": 0.80955219, + "learning_rate": 1.9540447176202976e-06, + "loss": 0.83777535, + "num_input_tokens_seen": 93748740, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 2.55078125, + "step": 4342, + "time_per_iteration": 3.0892937183380127 + }, + { + "auxiliary_loss_clip": 0.01526591, + "auxiliary_loss_mlp": 0.01218895, + "balance_loss_clip": 1.17079329, + "balance_loss_mlp": 1.01290131, + "epoch": 0.5222148740455721, + "flos": 67196246725440.0, + "grad_norm": 4.184223371294523, + "language_loss": 0.60651278, + "learning_rate": 1.9532659551431272e-06, + "loss": 0.63396764, + "num_input_tokens_seen": 93815770, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.0625, + "step": 4343, + "time_per_iteration": 3.5959548950195312 + }, + { + "auxiliary_loss_clip": 0.01532145, + "auxiliary_loss_mlp": 0.0127767, + "balance_loss_clip": 1.16936398, + "balance_loss_mlp": 1.02437437, + "epoch": 0.5223351169362112, + "flos": 61860346565760.0, + "grad_norm": 2.0317738591109675, + "language_loss": 0.67508543, + "learning_rate": 1.9524871997554744e-06, + "loss": 0.70318353, + "num_input_tokens_seen": 93843530, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.53125, + "step": 4344, + "time_per_iteration": 3.307450771331787 + }, + { + "auxiliary_loss_clip": 0.01531883, + "auxiliary_loss_mlp": 0.01273739, + "balance_loss_clip": 1.16877651, + "balance_loss_mlp": 1.02025259, + "epoch": 0.5224553598268502, + "flos": 14649139329120.0, + "grad_norm": 3.6355031217011784, + "language_loss": 0.80715358, + "learning_rate": 1.951708451575475e-06, + "loss": 0.83520985, + "num_input_tokens_seen": 93860595, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.53320312, + "step": 4345, + "time_per_iteration": 3.2582316398620605 + }, + { + "auxiliary_loss_clip": 0.01532939, + "auxiliary_loss_mlp": 0.0127985, + "balance_loss_clip": 1.16923225, + "balance_loss_mlp": 1.02560008, + "epoch": 0.5225756027174894, + "flos": 14827593143520.0, + "grad_norm": 3.6668047158564803, + "language_loss": 0.82405776, + "learning_rate": 1.9509297107212657e-06, + "loss": 0.85218567, + "num_input_tokens_seen": 93877365, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.54101562, + "step": 4346, + "time_per_iteration": 3.144028663635254 + }, + { + "auxiliary_loss_clip": 0.01530022, + "auxiliary_loss_mlp": 0.01276034, + "balance_loss_clip": 1.16725838, + "balance_loss_mlp": 1.02064061, + "epoch": 0.5226958456081284, + "flos": 23514176695680.0, + "grad_norm": 2.9944930547896806, + "language_loss": 0.79531014, + "learning_rate": 1.95015097731098e-06, + "loss": 0.82337075, + "num_input_tokens_seen": 93896855, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.55273438, + "step": 4347, + "time_per_iteration": 3.255688428878784 + }, + { + "auxiliary_loss_clip": 0.01526035, + "auxiliary_loss_mlp": 0.01297588, + "balance_loss_clip": 1.16257882, + "balance_loss_mlp": 1.04162192, + "epoch": 0.5228160884987675, + "flos": 19064969439840.0, + "grad_norm": 3.543391801234069, + "language_loss": 0.81711793, + "learning_rate": 1.949372251462751e-06, + "loss": 0.8453542, + "num_input_tokens_seen": 93914270, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.55859375, + "step": 4348, + "time_per_iteration": 3.100407123565674 + }, + { + "auxiliary_loss_clip": 0.01526446, + "auxiliary_loss_mlp": 0.01276748, + "balance_loss_clip": 1.16398597, + "balance_loss_mlp": 1.02440536, + "epoch": 0.5229363313894067, + "flos": 21065017210560.0, + "grad_norm": 1.9756502579960376, + "language_loss": 0.82866335, + "learning_rate": 1.9485935332947124e-06, + "loss": 0.85669523, + "num_input_tokens_seen": 93932180, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 2.52148438, + "step": 4349, + "time_per_iteration": 3.0475220680236816 + }, + { + "auxiliary_loss_clip": 0.01520188, + "auxiliary_loss_mlp": 0.01282046, + "balance_loss_clip": 1.15732241, + "balance_loss_mlp": 1.02913141, + "epoch": 0.5230565742800457, + "flos": 14832182450880.0, + "grad_norm": 2.4316098474540433, + "language_loss": 0.83518171, + "learning_rate": 1.947814822924993e-06, + "loss": 0.863204, + "num_input_tokens_seen": 93949690, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.52734375, + "step": 4350, + "time_per_iteration": 3.932255744934082 + }, + { + "auxiliary_loss_clip": 0.0153241, + "auxiliary_loss_mlp": 0.01283238, + "balance_loss_clip": 1.1697681, + "balance_loss_mlp": 1.03013229, + "epoch": 0.5231768171706848, + "flos": 25815452827680.0, + "grad_norm": 3.2599251312438717, + "language_loss": 0.82935256, + "learning_rate": 1.9470361204717236e-06, + "loss": 0.85750902, + "num_input_tokens_seen": 93968830, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.52929688, + "step": 4351, + "time_per_iteration": 3.055840253829956 + }, + { + "auxiliary_loss_clip": 0.01525414, + "auxiliary_loss_mlp": 0.01294206, + "balance_loss_clip": 1.16075516, + "balance_loss_mlp": 1.03823972, + "epoch": 0.5232970600613239, + "flos": 22745820379680.0, + "grad_norm": 1.8606849323260157, + "language_loss": 0.80846477, + "learning_rate": 1.9462574260530326e-06, + "loss": 0.83666098, + "num_input_tokens_seen": 93989110, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 2.55859375, + "step": 4352, + "time_per_iteration": 2.9752936363220215 + }, + { + "auxiliary_loss_clip": 0.01528257, + "auxiliary_loss_mlp": 0.01287109, + "balance_loss_clip": 1.16346574, + "balance_loss_mlp": 1.03266907, + "epoch": 0.523417302951963, + "flos": 17312595104160.0, + "grad_norm": 2.3567459923129013, + "language_loss": 0.80755806, + "learning_rate": 1.9454787397870472e-06, + "loss": 0.83571172, + "num_input_tokens_seen": 94006430, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 2.54296875, + "step": 4353, + "time_per_iteration": 3.775048017501831 + }, + { + "auxiliary_loss_clip": 0.01526143, + "auxiliary_loss_mlp": 0.01281637, + "balance_loss_clip": 1.16361594, + "balance_loss_mlp": 1.02700615, + "epoch": 0.523537545842602, + "flos": 18553730670720.0, + "grad_norm": 2.8391095867462828, + "language_loss": 0.71993846, + "learning_rate": 1.944700061791894e-06, + "loss": 0.74801624, + "num_input_tokens_seen": 94024825, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.54492188, + "step": 4354, + "time_per_iteration": 3.0080573558807373 + }, + { + "auxiliary_loss_clip": 0.01524924, + "auxiliary_loss_mlp": 0.01284245, + "balance_loss_clip": 1.16052127, + "balance_loss_mlp": 1.02789736, + "epoch": 0.5236577887332411, + "flos": 19721322806400.0, + "grad_norm": 2.2674082721164996, + "language_loss": 0.65525985, + "learning_rate": 1.943921392185698e-06, + "loss": 0.68335152, + "num_input_tokens_seen": 94043450, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 2.5625, + "step": 4355, + "time_per_iteration": 2.9665703773498535 + }, + { + "auxiliary_loss_clip": 0.01530732, + "auxiliary_loss_mlp": 0.01280735, + "balance_loss_clip": 1.16886401, + "balance_loss_mlp": 1.02820206, + "epoch": 0.5237780316238803, + "flos": 23552484501600.0, + "grad_norm": 2.668205086951728, + "language_loss": 0.77307177, + "learning_rate": 1.9431427310865814e-06, + "loss": 0.80118644, + "num_input_tokens_seen": 94063055, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 2.5234375, + "step": 4356, + "time_per_iteration": 3.0745060443878174 + }, + { + "auxiliary_loss_clip": 0.01525973, + "auxiliary_loss_mlp": 0.01288401, + "balance_loss_clip": 1.16130769, + "balance_loss_mlp": 1.03739357, + "epoch": 0.5238982745145193, + "flos": 22494088631520.0, + "grad_norm": 2.4442176270991913, + "language_loss": 0.78554803, + "learning_rate": 1.942364078612667e-06, + "loss": 0.81369174, + "num_input_tokens_seen": 94081785, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 2.5078125, + "step": 4357, + "time_per_iteration": 3.862362861633301 + }, + { + "auxiliary_loss_clip": 0.01527125, + "auxiliary_loss_mlp": 0.01285814, + "balance_loss_clip": 1.16378164, + "balance_loss_mlp": 1.03080106, + "epoch": 0.5240185174051584, + "flos": 27091368881280.0, + "grad_norm": 2.1654448624726483, + "language_loss": 0.75217736, + "learning_rate": 1.9415854348820765e-06, + "loss": 0.78030682, + "num_input_tokens_seen": 94101635, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.54882812, + "step": 4358, + "time_per_iteration": 3.0383286476135254 + }, + { + "auxiliary_loss_clip": 0.01528424, + "auxiliary_loss_mlp": 0.01290996, + "balance_loss_clip": 1.16451061, + "balance_loss_mlp": 1.0359838, + "epoch": 0.5241387602957975, + "flos": 22676904184320.0, + "grad_norm": 84.14780672094919, + "language_loss": 0.68389595, + "learning_rate": 1.940806800012929e-06, + "loss": 0.71209013, + "num_input_tokens_seen": 94121705, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 2.54882812, + "step": 4359, + "time_per_iteration": 2.984893321990967 + }, + { + "auxiliary_loss_clip": 0.0153321, + "auxiliary_loss_mlp": 0.01295082, + "balance_loss_clip": 1.16975951, + "balance_loss_mlp": 1.03625536, + "epoch": 0.5242590031864366, + "flos": 40556720750400.0, + "grad_norm": 1.8559675990422944, + "language_loss": 0.63712704, + "learning_rate": 1.9400281741233432e-06, + "loss": 0.66540992, + "num_input_tokens_seen": 94146595, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.58789062, + "step": 4360, + "time_per_iteration": 3.099083185195923 + }, + { + "auxiliary_loss_clip": 0.01529325, + "auxiliary_loss_mlp": 0.01214729, + "balance_loss_clip": 1.17278457, + "balance_loss_mlp": 1.00873566, + "epoch": 0.5243792460770756, + "flos": 66683035692000.0, + "grad_norm": 0.6674324805678898, + "language_loss": 0.52509493, + "learning_rate": 1.939249557331435e-06, + "loss": 0.55253547, + "num_input_tokens_seen": 94212410, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.0625, + "step": 4361, + "time_per_iteration": 3.4546942710876465 + }, + { + "auxiliary_loss_clip": 0.01528729, + "auxiliary_loss_mlp": 0.01293463, + "balance_loss_clip": 1.16517997, + "balance_loss_mlp": 1.03959537, + "epoch": 0.5244994889677148, + "flos": 28186176149280.0, + "grad_norm": 11.584823376676518, + "language_loss": 0.7254113, + "learning_rate": 1.938470949755321e-06, + "loss": 0.75363314, + "num_input_tokens_seen": 94232290, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 2.53710938, + "step": 4362, + "time_per_iteration": 3.025385856628418 + }, + { + "auxiliary_loss_clip": 0.01532985, + "auxiliary_loss_mlp": 0.0121122, + "balance_loss_clip": 1.17619741, + "balance_loss_mlp": 1.00598907, + "epoch": 0.5246197318583539, + "flos": 65957273064000.0, + "grad_norm": 0.8204516029147625, + "language_loss": 0.556458, + "learning_rate": 1.937692351513115e-06, + "loss": 0.58390003, + "num_input_tokens_seen": 94291285, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.0546875, + "step": 4363, + "time_per_iteration": 3.3359851837158203 + }, + { + "auxiliary_loss_clip": 0.01529356, + "auxiliary_loss_mlp": 0.01277744, + "balance_loss_clip": 1.16609251, + "balance_loss_mlp": 1.02292252, + "epoch": 0.5247399747489929, + "flos": 21035660450400.0, + "grad_norm": 1.6705470307604793, + "language_loss": 0.80644578, + "learning_rate": 1.9369137627229297e-06, + "loss": 0.83451676, + "num_input_tokens_seen": 94309685, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.546875, + "step": 4364, + "time_per_iteration": 3.0355584621429443 + }, + { + "auxiliary_loss_clip": 0.01529639, + "auxiliary_loss_mlp": 0.01271582, + "balance_loss_clip": 1.16624999, + "balance_loss_mlp": 1.01847661, + "epoch": 0.5248602176396321, + "flos": 19027951191360.0, + "grad_norm": 2.0793253550910995, + "language_loss": 0.88074458, + "learning_rate": 1.936135183502877e-06, + "loss": 0.90875679, + "num_input_tokens_seen": 94326985, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.52929688, + "step": 4365, + "time_per_iteration": 2.9753050804138184 + }, + { + "auxiliary_loss_clip": 0.01525626, + "auxiliary_loss_mlp": 0.01290366, + "balance_loss_clip": 1.16081762, + "balance_loss_mlp": 1.03745198, + "epoch": 0.5249804605302711, + "flos": 22202266453920.0, + "grad_norm": 2.411571583064117, + "language_loss": 0.80732048, + "learning_rate": 1.935356613971066e-06, + "loss": 0.83548039, + "num_input_tokens_seen": 94347645, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 2.52734375, + "step": 4366, + "time_per_iteration": 2.9962754249572754 + }, + { + "auxiliary_loss_clip": 0.01523821, + "auxiliary_loss_mlp": 0.01277881, + "balance_loss_clip": 1.1612004, + "balance_loss_mlp": 1.02229619, + "epoch": 0.5251007034209102, + "flos": 23807971137600.0, + "grad_norm": 2.0181023154112676, + "language_loss": 0.76863754, + "learning_rate": 1.9345780542456047e-06, + "loss": 0.79665458, + "num_input_tokens_seen": 94367020, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.5546875, + "step": 4367, + "time_per_iteration": 3.113501787185669 + }, + { + "auxiliary_loss_clip": 0.01523115, + "auxiliary_loss_mlp": 0.0126864, + "balance_loss_clip": 1.16006768, + "balance_loss_mlp": 1.01725161, + "epoch": 0.5252209463115494, + "flos": 23296846152960.0, + "grad_norm": 3.8005207053241676, + "language_loss": 0.71729088, + "learning_rate": 1.9337995044446007e-06, + "loss": 0.74520844, + "num_input_tokens_seen": 94385860, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.51171875, + "step": 4368, + "time_per_iteration": 3.0041332244873047 + }, + { + "auxiliary_loss_clip": 0.01529145, + "auxiliary_loss_mlp": 0.01274218, + "balance_loss_clip": 1.16624737, + "balance_loss_mlp": 1.02206683, + "epoch": 0.5253411892021884, + "flos": 19830746640960.0, + "grad_norm": 1.9697197045892414, + "language_loss": 0.80057883, + "learning_rate": 1.9330209646861596e-06, + "loss": 0.82861245, + "num_input_tokens_seen": 94405010, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.51953125, + "step": 4369, + "time_per_iteration": 3.04068660736084 + }, + { + "auxiliary_loss_clip": 0.01519065, + "auxiliary_loss_mlp": 0.01281089, + "balance_loss_clip": 1.15521884, + "balance_loss_mlp": 1.02855563, + "epoch": 0.5254614320928275, + "flos": 24136015072320.0, + "grad_norm": 1.7282201765869125, + "language_loss": 0.77658486, + "learning_rate": 1.9322424350883843e-06, + "loss": 0.80458641, + "num_input_tokens_seen": 94426845, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 2.5234375, + "step": 4370, + "time_per_iteration": 3.0990688800811768 + }, + { + "auxiliary_loss_clip": 0.01520454, + "auxiliary_loss_mlp": 0.01277523, + "balance_loss_clip": 1.15846968, + "balance_loss_mlp": 1.02479935, + "epoch": 0.5255816749834666, + "flos": 24647291769600.0, + "grad_norm": 1.8704963491597055, + "language_loss": 0.78956401, + "learning_rate": 1.931463915769379e-06, + "loss": 0.81754386, + "num_input_tokens_seen": 94446960, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.52539062, + "step": 4371, + "time_per_iteration": 2.987328290939331 + }, + { + "auxiliary_loss_clip": 0.01527742, + "auxiliary_loss_mlp": 0.01281969, + "balance_loss_clip": 1.16337156, + "balance_loss_mlp": 1.02790987, + "epoch": 0.5257019178741057, + "flos": 14138317769760.0, + "grad_norm": 2.2287553836648772, + "language_loss": 0.74022686, + "learning_rate": 1.930685406847242e-06, + "loss": 0.76832396, + "num_input_tokens_seen": 94461535, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.5390625, + "step": 4372, + "time_per_iteration": 2.9615375995635986 + }, + { + "auxiliary_loss_clip": 0.0151602, + "auxiliary_loss_mlp": 0.01275105, + "balance_loss_clip": 1.1535902, + "balance_loss_mlp": 1.02352595, + "epoch": 0.5258221607647448, + "flos": 23551157016000.0, + "grad_norm": 1.7153055141843538, + "language_loss": 0.81891316, + "learning_rate": 1.9299069084400734e-06, + "loss": 0.84682441, + "num_input_tokens_seen": 94482395, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.51367188, + "step": 4373, + "time_per_iteration": 3.1123642921447754 + }, + { + "auxiliary_loss_clip": 0.01530491, + "auxiliary_loss_mlp": 0.01282267, + "balance_loss_clip": 1.16759837, + "balance_loss_mlp": 1.02763641, + "epoch": 0.5259424036553839, + "flos": 24968053497600.0, + "grad_norm": 2.0139596244866302, + "language_loss": 0.69801289, + "learning_rate": 1.9291284206659717e-06, + "loss": 0.7261405, + "num_input_tokens_seen": 94500580, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.54492188, + "step": 4374, + "time_per_iteration": 3.0367655754089355 + }, + { + "auxiliary_loss_clip": 0.01529366, + "auxiliary_loss_mlp": 0.0128122, + "balance_loss_clip": 1.16733909, + "balance_loss_mlp": 1.02830589, + "epoch": 0.526062646546023, + "flos": 28766255257440.0, + "grad_norm": 1.9453013145369513, + "language_loss": 0.71631199, + "learning_rate": 1.928349943643032e-06, + "loss": 0.74441791, + "num_input_tokens_seen": 94519680, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.52734375, + "step": 4375, + "time_per_iteration": 3.116232395172119 + }, + { + "auxiliary_loss_clip": 0.0152646, + "auxiliary_loss_mlp": 0.01278436, + "balance_loss_clip": 1.16447747, + "balance_loss_mlp": 1.02761912, + "epoch": 0.526182889436662, + "flos": 22823649692640.0, + "grad_norm": 2.0033943027596175, + "language_loss": 0.81805372, + "learning_rate": 1.9275714774893493e-06, + "loss": 0.84610271, + "num_input_tokens_seen": 94539135, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 2.50585938, + "step": 4376, + "time_per_iteration": 2.978330612182617 + }, + { + "auxiliary_loss_clip": 0.01526931, + "auxiliary_loss_mlp": 0.01284629, + "balance_loss_clip": 1.16547966, + "balance_loss_mlp": 1.02961659, + "epoch": 0.5263031323273012, + "flos": 22931670185280.0, + "grad_norm": 2.3340940243354127, + "language_loss": 0.72620177, + "learning_rate": 1.9267930223230154e-06, + "loss": 0.75431728, + "num_input_tokens_seen": 94557610, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.54882812, + "step": 4377, + "time_per_iteration": 4.60008978843689 + }, + { + "auxiliary_loss_clip": 0.01525306, + "auxiliary_loss_mlp": 0.01278711, + "balance_loss_clip": 1.1638602, + "balance_loss_mlp": 1.02713156, + "epoch": 0.5264233752179402, + "flos": 17750555939520.0, + "grad_norm": 2.151514344725594, + "language_loss": 0.78122234, + "learning_rate": 1.9260145782621224e-06, + "loss": 0.80926251, + "num_input_tokens_seen": 94575390, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 2.51367188, + "step": 4378, + "time_per_iteration": 2.9685473442077637 + }, + { + "auxiliary_loss_clip": 0.01520819, + "auxiliary_loss_mlp": 0.01276151, + "balance_loss_clip": 1.15835071, + "balance_loss_mlp": 1.02438128, + "epoch": 0.5265436181085793, + "flos": 24423703080480.0, + "grad_norm": 2.3332105771632734, + "language_loss": 0.88280636, + "learning_rate": 1.925236145424758e-06, + "loss": 0.91077602, + "num_input_tokens_seen": 94594210, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.515625, + "step": 4379, + "time_per_iteration": 2.9981939792633057 + }, + { + "auxiliary_loss_clip": 0.01547509, + "auxiliary_loss_mlp": 0.01194771, + "balance_loss_clip": 1.19164765, + "balance_loss_mlp": 0.99182892, + "epoch": 0.5266638609992185, + "flos": 69214917222720.0, + "grad_norm": 0.7045095539451001, + "language_loss": 0.57545608, + "learning_rate": 1.924457723929012e-06, + "loss": 0.60287887, + "num_input_tokens_seen": 94665020, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.03125, + "step": 4380, + "time_per_iteration": 3.5893590450286865 + }, + { + "auxiliary_loss_clip": 0.01521417, + "auxiliary_loss_mlp": 0.01282739, + "balance_loss_clip": 1.16083324, + "balance_loss_mlp": 1.02925181, + "epoch": 0.5267841038898575, + "flos": 20740690235520.0, + "grad_norm": 1.9602677493018839, + "language_loss": 0.83266449, + "learning_rate": 1.9236793138929685e-06, + "loss": 0.86070609, + "num_input_tokens_seen": 94684290, + "router_z_loss_clip": 3.60742188, + "router_z_loss_mlp": 2.53320312, + "step": 4381, + "time_per_iteration": 3.9492886066436768 + }, + { + "auxiliary_loss_clip": 0.01524577, + "auxiliary_loss_mlp": 0.0129011, + "balance_loss_clip": 1.16473949, + "balance_loss_mlp": 1.03471565, + "epoch": 0.5269043467804966, + "flos": 17236093276800.0, + "grad_norm": 3.5452055616032623, + "language_loss": 0.81085241, + "learning_rate": 1.9229009154347133e-06, + "loss": 0.83899921, + "num_input_tokens_seen": 94701880, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 2.55273438, + "step": 4382, + "time_per_iteration": 3.0808658599853516 + }, + { + "auxiliary_loss_clip": 0.01523998, + "auxiliary_loss_mlp": 0.0128267, + "balance_loss_clip": 1.16195285, + "balance_loss_mlp": 1.03223491, + "epoch": 0.5270245896711357, + "flos": 18225117813600.0, + "grad_norm": 2.370020448460418, + "language_loss": 0.80506116, + "learning_rate": 1.922122528672327e-06, + "loss": 0.83312786, + "num_input_tokens_seen": 94720545, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.50195312, + "step": 4383, + "time_per_iteration": 3.0359156131744385 + }, + { + "auxiliary_loss_clip": 0.01520527, + "auxiliary_loss_mlp": 0.01286111, + "balance_loss_clip": 1.15934074, + "balance_loss_mlp": 1.03510356, + "epoch": 0.5271448325617748, + "flos": 21289667888160.0, + "grad_norm": 3.5351040991009297, + "language_loss": 0.78190351, + "learning_rate": 1.9213441537238914e-06, + "loss": 0.8099699, + "num_input_tokens_seen": 94737420, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 2.5078125, + "step": 4384, + "time_per_iteration": 3.9371235370635986 + }, + { + "auxiliary_loss_clip": 0.0154374, + "auxiliary_loss_mlp": 0.01216919, + "balance_loss_clip": 1.18842196, + "balance_loss_mlp": 1.01397705, + "epoch": 0.5272650754524139, + "flos": 65501410137120.0, + "grad_norm": 0.8389530269985485, + "language_loss": 0.57360023, + "learning_rate": 1.920565790707485e-06, + "loss": 0.60120684, + "num_input_tokens_seen": 94802810, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 2.03125, + "step": 4385, + "time_per_iteration": 3.8332126140594482 + }, + { + "auxiliary_loss_clip": 0.01532899, + "auxiliary_loss_mlp": 0.0128572, + "balance_loss_clip": 1.17125177, + "balance_loss_mlp": 1.02918172, + "epoch": 0.527385318343053, + "flos": 19678122267840.0, + "grad_norm": 2.1969053232602596, + "language_loss": 0.66083336, + "learning_rate": 1.9197874397411853e-06, + "loss": 0.68901956, + "num_input_tokens_seen": 94819440, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.56445312, + "step": 4386, + "time_per_iteration": 3.05979323387146 + }, + { + "auxiliary_loss_clip": 0.01528677, + "auxiliary_loss_mlp": 0.01286645, + "balance_loss_clip": 1.16838431, + "balance_loss_mlp": 1.03105998, + "epoch": 0.5275055612336921, + "flos": 12713570159040.0, + "grad_norm": 5.09906940261607, + "language_loss": 0.66008639, + "learning_rate": 1.919009100943067e-06, + "loss": 0.68823957, + "num_input_tokens_seen": 94835130, + "router_z_loss_clip": 3.6015625, + "router_z_loss_mlp": 2.5546875, + "step": 4387, + "time_per_iteration": 3.028282403945923 + }, + { + "auxiliary_loss_clip": 0.01525032, + "auxiliary_loss_mlp": 0.01298959, + "balance_loss_clip": 1.16517019, + "balance_loss_mlp": 1.0428021, + "epoch": 0.5276258041243311, + "flos": 17750973149280.0, + "grad_norm": 2.8854667137412715, + "language_loss": 0.66290498, + "learning_rate": 1.9182307744312043e-06, + "loss": 0.69114488, + "num_input_tokens_seen": 94852235, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 2.56054688, + "step": 4388, + "time_per_iteration": 3.0547683238983154 + }, + { + "auxiliary_loss_clip": 0.01523387, + "auxiliary_loss_mlp": 0.01279241, + "balance_loss_clip": 1.16251683, + "balance_loss_mlp": 1.02728045, + "epoch": 0.5277460470149702, + "flos": 22712632875360.0, + "grad_norm": 2.0643589705296272, + "language_loss": 0.76426196, + "learning_rate": 1.9174524603236676e-06, + "loss": 0.79228824, + "num_input_tokens_seen": 94871185, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 2.51757812, + "step": 4389, + "time_per_iteration": 3.0948100090026855 + }, + { + "auxiliary_loss_clip": 0.01523833, + "auxiliary_loss_mlp": 0.01281499, + "balance_loss_clip": 1.16447628, + "balance_loss_mlp": 1.02553296, + "epoch": 0.5278662899056094, + "flos": 19904821066080.0, + "grad_norm": 2.147874158234256, + "language_loss": 0.75733924, + "learning_rate": 1.916674158738527e-06, + "loss": 0.78539258, + "num_input_tokens_seen": 94890090, + "router_z_loss_clip": 3.59570312, + "router_z_loss_mlp": 2.55859375, + "step": 4390, + "time_per_iteration": 3.0770506858825684 + }, + { + "auxiliary_loss_clip": 0.01523144, + "auxiliary_loss_mlp": 0.0128697, + "balance_loss_clip": 1.16057253, + "balance_loss_mlp": 1.0285244, + "epoch": 0.5279865327962484, + "flos": 18007218348480.0, + "grad_norm": 2.1148891754767516, + "language_loss": 0.60778129, + "learning_rate": 1.9158958697938506e-06, + "loss": 0.63588244, + "num_input_tokens_seen": 94908470, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.58398438, + "step": 4391, + "time_per_iteration": 2.9135360717773438 + }, + { + "auxiliary_loss_clip": 0.01527373, + "auxiliary_loss_mlp": 0.01281767, + "balance_loss_clip": 1.16638923, + "balance_loss_mlp": 1.02980661, + "epoch": 0.5281067756868875, + "flos": 15926269083840.0, + "grad_norm": 3.4551263232584, + "language_loss": 0.86132532, + "learning_rate": 1.9151175936077032e-06, + "loss": 0.88941669, + "num_input_tokens_seen": 94923440, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 2.51757812, + "step": 4392, + "time_per_iteration": 3.0638790130615234 + }, + { + "auxiliary_loss_clip": 0.0152413, + "auxiliary_loss_mlp": 0.01297856, + "balance_loss_clip": 1.16342163, + "balance_loss_mlp": 1.04246187, + "epoch": 0.5282270185775266, + "flos": 19428476568480.0, + "grad_norm": 1.6610151115992131, + "language_loss": 0.79633451, + "learning_rate": 1.9143393302981507e-06, + "loss": 0.82455438, + "num_input_tokens_seen": 94941125, + "router_z_loss_clip": 3.609375, + "router_z_loss_mlp": 2.55273438, + "step": 4393, + "time_per_iteration": 3.0384914875030518 + }, + { + "auxiliary_loss_clip": 0.01525543, + "auxiliary_loss_mlp": 0.01291475, + "balance_loss_clip": 1.16534913, + "balance_loss_mlp": 1.03703427, + "epoch": 0.5283472614681657, + "flos": 16401513664800.0, + "grad_norm": 1.7310010012607493, + "language_loss": 0.83441502, + "learning_rate": 1.913561079983252e-06, + "loss": 0.86258519, + "num_input_tokens_seen": 94959950, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 2.54296875, + "step": 4394, + "time_per_iteration": 2.9286131858825684 + }, + { + "auxiliary_loss_clip": 0.01525677, + "auxiliary_loss_mlp": 0.01289811, + "balance_loss_clip": 1.16617346, + "balance_loss_mlp": 1.03308177, + "epoch": 0.5284675043588047, + "flos": 26762945664960.0, + "grad_norm": 2.5102647316434195, + "language_loss": 0.7517724, + "learning_rate": 1.9127828427810693e-06, + "loss": 0.77992725, + "num_input_tokens_seen": 94980515, + "router_z_loss_clip": 3.59375, + "router_z_loss_mlp": 2.56640625, + "step": 4395, + "time_per_iteration": 3.1017401218414307 + }, + { + "auxiliary_loss_clip": 0.01524348, + "auxiliary_loss_mlp": 0.01290198, + "balance_loss_clip": 1.16444743, + "balance_loss_mlp": 1.03499448, + "epoch": 0.5285877472494439, + "flos": 19901862669600.0, + "grad_norm": 2.217673773822656, + "language_loss": 0.81005287, + "learning_rate": 1.9120046188096607e-06, + "loss": 0.8381983, + "num_input_tokens_seen": 94998560, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 2.55078125, + "step": 4396, + "time_per_iteration": 3.0677382946014404 + }, + { + "auxiliary_loss_clip": 0.01529342, + "auxiliary_loss_mlp": 0.0127492, + "balance_loss_clip": 1.17103267, + "balance_loss_mlp": 1.02238703, + "epoch": 0.528707990140083, + "flos": 20013334624800.0, + "grad_norm": 2.040104851859065, + "language_loss": 0.74095428, + "learning_rate": 1.9112264081870804e-06, + "loss": 0.76899695, + "num_input_tokens_seen": 95016950, + "router_z_loss_clip": 3.58203125, + "router_z_loss_mlp": 2.5234375, + "step": 4397, + "time_per_iteration": 2.9354212284088135 + }, + { + "auxiliary_loss_clip": 0.01522747, + "auxiliary_loss_mlp": 0.01284289, + "balance_loss_clip": 1.16323471, + "balance_loss_mlp": 1.02908564, + "epoch": 0.528828233030722, + "flos": 20670067272960.0, + "grad_norm": 2.4637594450847806, + "language_loss": 0.75774825, + "learning_rate": 1.9104482110313843e-06, + "loss": 0.78581858, + "num_input_tokens_seen": 95036540, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 2.55078125, + "step": 4398, + "time_per_iteration": 3.010820150375366 + }, + { + "auxiliary_loss_clip": 0.0152067, + "auxiliary_loss_mlp": 0.01282647, + "balance_loss_clip": 1.16074955, + "balance_loss_mlp": 1.03087735, + "epoch": 0.5289484759213612, + "flos": 25194828152160.0, + "grad_norm": 1.9759031211366263, + "language_loss": 0.74365497, + "learning_rate": 1.909670027460623e-06, + "loss": 0.77168816, + "num_input_tokens_seen": 95053840, + "router_z_loss_clip": 3.60351562, + "router_z_loss_mlp": 2.515625, + "step": 4399, + "time_per_iteration": 2.9368700981140137 + }, + { + "auxiliary_loss_clip": 0.01528283, + "auxiliary_loss_mlp": 0.01274199, + "balance_loss_clip": 1.16866851, + "balance_loss_mlp": 1.01975906, + "epoch": 0.5290687188120002, + "flos": 31141947168000.0, + "grad_norm": 4.061563220624084, + "language_loss": 0.71754134, + "learning_rate": 1.908891857592847e-06, + "loss": 0.74556613, + "num_input_tokens_seen": 95074910, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 2.54296875, + "step": 4400, + "time_per_iteration": 3.068770408630371 + }, + { + "auxiliary_loss_clip": 0.01522126, + "auxiliary_loss_mlp": 0.01286446, + "balance_loss_clip": 1.16286242, + "balance_loss_mlp": 1.03505778, + "epoch": 0.5291889617026393, + "flos": 20121886111680.0, + "grad_norm": 2.304284498842648, + "language_loss": 0.89836121, + "learning_rate": 1.9081137015461034e-06, + "loss": 0.92644691, + "num_input_tokens_seen": 95090985, + "router_z_loss_clip": 3.59375, + "router_z_loss_mlp": 2.51171875, + "step": 4401, + "time_per_iteration": 3.0147740840911865 + }, + { + "auxiliary_loss_clip": 0.01522599, + "auxiliary_loss_mlp": 0.01279767, + "balance_loss_clip": 1.16365051, + "balance_loss_mlp": 1.02647138, + "epoch": 0.5293092045932785, + "flos": 19645655398560.0, + "grad_norm": 3.5448567292428033, + "language_loss": 0.90530318, + "learning_rate": 1.9073355594384383e-06, + "loss": 0.93332684, + "num_input_tokens_seen": 95109225, + "router_z_loss_clip": 3.59179688, + "router_z_loss_mlp": 2.53125, + "step": 4402, + "time_per_iteration": 2.98879075050354 + }, + { + "auxiliary_loss_clip": 0.01523616, + "auxiliary_loss_mlp": 0.01280307, + "balance_loss_clip": 1.16519499, + "balance_loss_mlp": 1.02853656, + "epoch": 0.5294294474839175, + "flos": 24320120182560.0, + "grad_norm": 2.426238012760507, + "language_loss": 0.80776507, + "learning_rate": 1.906557431387895e-06, + "loss": 0.83580422, + "num_input_tokens_seen": 95128215, + "router_z_loss_clip": 3.5859375, + "router_z_loss_mlp": 2.515625, + "step": 4403, + "time_per_iteration": 3.1118855476379395 + }, + { + "auxiliary_loss_clip": 0.01522082, + "auxiliary_loss_mlp": 0.01286099, + "balance_loss_clip": 1.16337156, + "balance_loss_mlp": 1.03413808, + "epoch": 0.5295496903745566, + "flos": 18877716292320.0, + "grad_norm": 2.3693770989763756, + "language_loss": 0.78979468, + "learning_rate": 1.905779317512516e-06, + "loss": 0.81787652, + "num_input_tokens_seen": 95145760, + "router_z_loss_clip": 3.5859375, + "router_z_loss_mlp": 2.51757812, + "step": 4404, + "time_per_iteration": 4.006296634674072 + }, + { + "auxiliary_loss_clip": 0.01520379, + "auxiliary_loss_mlp": 0.01277721, + "balance_loss_clip": 1.16207099, + "balance_loss_mlp": 1.02576029, + "epoch": 0.5296699332651957, + "flos": 20925212555520.0, + "grad_norm": 2.3432124348494243, + "language_loss": 0.80398113, + "learning_rate": 1.9050012179303385e-06, + "loss": 0.83196211, + "num_input_tokens_seen": 95164270, + "router_z_loss_clip": 3.58203125, + "router_z_loss_mlp": 2.51757812, + "step": 4405, + "time_per_iteration": 3.8287644386291504 + }, + { + "auxiliary_loss_clip": 0.01519345, + "auxiliary_loss_mlp": 0.01274958, + "balance_loss_clip": 1.16060829, + "balance_loss_mlp": 1.01975441, + "epoch": 0.5297901761558348, + "flos": 22048504236000.0, + "grad_norm": 4.846260699798774, + "language_loss": 0.69550812, + "learning_rate": 1.904223132759401e-06, + "loss": 0.7234512, + "num_input_tokens_seen": 95182870, + "router_z_loss_clip": 3.58789062, + "router_z_loss_mlp": 2.55078125, + "step": 4406, + "time_per_iteration": 3.0835461616516113 + }, + { + "auxiliary_loss_clip": 0.01517919, + "auxiliary_loss_mlp": 0.01282104, + "balance_loss_clip": 1.15987253, + "balance_loss_mlp": 1.02976203, + "epoch": 0.5299104190464738, + "flos": 21800641160160.0, + "grad_norm": 3.696318073971675, + "language_loss": 0.69266838, + "learning_rate": 1.9034450621177383e-06, + "loss": 0.72066855, + "num_input_tokens_seen": 95201190, + "router_z_loss_clip": 3.58203125, + "router_z_loss_mlp": 2.52148438, + "step": 4407, + "time_per_iteration": 3.073434352874756 + }, + { + "auxiliary_loss_clip": 0.01523408, + "auxiliary_loss_mlp": 0.01298541, + "balance_loss_clip": 1.16254508, + "balance_loss_mlp": 1.04257488, + "epoch": 0.530030661937113, + "flos": 14722568975520.0, + "grad_norm": 3.099164376444363, + "language_loss": 0.70246452, + "learning_rate": 1.9026670061233824e-06, + "loss": 0.73068404, + "num_input_tokens_seen": 95218625, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 2.55859375, + "step": 4408, + "time_per_iteration": 3.0009961128234863 + }, + { + "auxiliary_loss_clip": 0.01510503, + "auxiliary_loss_mlp": 0.01275809, + "balance_loss_clip": 1.14973319, + "balance_loss_mlp": 1.02499247, + "epoch": 0.5301509048277521, + "flos": 21253635771840.0, + "grad_norm": 2.2943394406869277, + "language_loss": 0.80820966, + "learning_rate": 1.901888964894365e-06, + "loss": 0.8360728, + "num_input_tokens_seen": 95237665, + "router_z_loss_clip": 3.609375, + "router_z_loss_mlp": 2.50585938, + "step": 4409, + "time_per_iteration": 3.8807058334350586 + }, + { + "auxiliary_loss_clip": 0.01513821, + "auxiliary_loss_mlp": 0.01267695, + "balance_loss_clip": 1.15411282, + "balance_loss_mlp": 1.01459002, + "epoch": 0.5302711477183911, + "flos": 25959391652160.0, + "grad_norm": 3.2308690839903655, + "language_loss": 0.68299389, + "learning_rate": 1.9011109385487134e-06, + "loss": 0.71080899, + "num_input_tokens_seen": 95258915, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 2.52929688, + "step": 4410, + "time_per_iteration": 2.9629807472229004 + }, + { + "auxiliary_loss_clip": 0.01514122, + "auxiliary_loss_mlp": 0.01268884, + "balance_loss_clip": 1.15459538, + "balance_loss_mlp": 1.01539683, + "epoch": 0.5303913906090303, + "flos": 22275316818720.0, + "grad_norm": 2.9810093971975715, + "language_loss": 0.66467482, + "learning_rate": 1.900332927204454e-06, + "loss": 0.69250488, + "num_input_tokens_seen": 95277365, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 2.53320312, + "step": 4411, + "time_per_iteration": 2.947357177734375 + }, + { + "auxiliary_loss_clip": 0.01514871, + "auxiliary_loss_mlp": 0.01277368, + "balance_loss_clip": 1.15491354, + "balance_loss_mlp": 1.02216458, + "epoch": 0.5305116334996693, + "flos": 24938317455840.0, + "grad_norm": 3.0490042399091792, + "language_loss": 0.76816773, + "learning_rate": 1.8995549309796097e-06, + "loss": 0.79609013, + "num_input_tokens_seen": 95296670, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 2.55078125, + "step": 4412, + "time_per_iteration": 3.8309683799743652 + }, + { + "auxiliary_loss_clip": 0.01511905, + "auxiliary_loss_mlp": 0.01280292, + "balance_loss_clip": 1.15156817, + "balance_loss_mlp": 1.02642369, + "epoch": 0.5306318763903084, + "flos": 20191181588640.0, + "grad_norm": 1.8989734311456627, + "language_loss": 0.76938546, + "learning_rate": 1.8987769499922028e-06, + "loss": 0.79730737, + "num_input_tokens_seen": 95315640, + "router_z_loss_clip": 3.60351562, + "router_z_loss_mlp": 2.53710938, + "step": 4413, + "time_per_iteration": 2.9831368923187256 + }, + { + "auxiliary_loss_clip": 0.01512955, + "auxiliary_loss_mlp": 0.01271094, + "balance_loss_clip": 1.15413523, + "balance_loss_mlp": 1.01894236, + "epoch": 0.5307521192809476, + "flos": 20268441979200.0, + "grad_norm": 2.7443851827268366, + "language_loss": 0.71443921, + "learning_rate": 1.897998984360252e-06, + "loss": 0.74227971, + "num_input_tokens_seen": 95334610, + "router_z_loss_clip": 3.58984375, + "router_z_loss_mlp": 2.51953125, + "step": 4414, + "time_per_iteration": 3.1651768684387207 + }, + { + "auxiliary_loss_clip": 0.01517865, + "auxiliary_loss_mlp": 0.01285639, + "balance_loss_clip": 1.15679812, + "balance_loss_mlp": 1.03405988, + "epoch": 0.5308723621715866, + "flos": 28847042966880.0, + "grad_norm": 1.8424643604189221, + "language_loss": 0.78536373, + "learning_rate": 1.897221034201775e-06, + "loss": 0.81339878, + "num_input_tokens_seen": 95358350, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 2.51367188, + "step": 4415, + "time_per_iteration": 3.1334941387176514 + }, + { + "auxiliary_loss_clip": 0.01509408, + "auxiliary_loss_mlp": 0.01273776, + "balance_loss_clip": 1.14928508, + "balance_loss_mlp": 1.02353168, + "epoch": 0.5309926050622257, + "flos": 27460451449440.0, + "grad_norm": 1.6039439207761184, + "language_loss": 0.67124516, + "learning_rate": 1.8964430996347842e-06, + "loss": 0.69907701, + "num_input_tokens_seen": 95379900, + "router_z_loss_clip": 3.6015625, + "router_z_loss_mlp": 2.5, + "step": 4416, + "time_per_iteration": 2.9889345169067383 + }, + { + "auxiliary_loss_clip": 0.0151388, + "auxiliary_loss_mlp": 0.01286615, + "balance_loss_clip": 1.1532979, + "balance_loss_mlp": 1.0316025, + "epoch": 0.5311128479528648, + "flos": 20516229198720.0, + "grad_norm": 3.33688087328188, + "language_loss": 0.82823408, + "learning_rate": 1.8956651807772931e-06, + "loss": 0.85623902, + "num_input_tokens_seen": 95397935, + "router_z_loss_clip": 3.609375, + "router_z_loss_mlp": 2.54882812, + "step": 4417, + "time_per_iteration": 3.065365791320801 + }, + { + "auxiliary_loss_clip": 0.01513549, + "auxiliary_loss_mlp": 0.0127285, + "balance_loss_clip": 1.15411472, + "balance_loss_mlp": 1.0224148, + "epoch": 0.5312330908435039, + "flos": 21399812357760.0, + "grad_norm": 1.669726153737056, + "language_loss": 0.83970904, + "learning_rate": 1.8948872777473115e-06, + "loss": 0.86757302, + "num_input_tokens_seen": 95415890, + "router_z_loss_clip": 3.59570312, + "router_z_loss_mlp": 2.50195312, + "step": 4418, + "time_per_iteration": 2.9312798976898193 + }, + { + "auxiliary_loss_clip": 0.01516275, + "auxiliary_loss_mlp": 0.01283573, + "balance_loss_clip": 1.15650344, + "balance_loss_mlp": 1.02932322, + "epoch": 0.531353333734143, + "flos": 24719659427520.0, + "grad_norm": 2.2321848801863564, + "language_loss": 0.63329351, + "learning_rate": 1.8941093906628458e-06, + "loss": 0.66129196, + "num_input_tokens_seen": 95433675, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 2.54101562, + "step": 4419, + "time_per_iteration": 3.047727346420288 + }, + { + "auxiliary_loss_clip": 0.01514344, + "auxiliary_loss_mlp": 0.01282215, + "balance_loss_clip": 1.15403235, + "balance_loss_mlp": 1.03006327, + "epoch": 0.531473576624782, + "flos": 30483242255520.0, + "grad_norm": 2.191436684294826, + "language_loss": 0.70713228, + "learning_rate": 1.893331519641902e-06, + "loss": 0.73509789, + "num_input_tokens_seen": 95455820, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 2.51953125, + "step": 4420, + "time_per_iteration": 3.1123907566070557 + }, + { + "auxiliary_loss_clip": 0.01510292, + "auxiliary_loss_mlp": 0.0129407, + "balance_loss_clip": 1.15008903, + "balance_loss_mlp": 1.04058349, + "epoch": 0.5315938195154212, + "flos": 23005213616160.0, + "grad_norm": 2.8575666243335944, + "language_loss": 0.74094725, + "learning_rate": 1.8925536648024815e-06, + "loss": 0.76899087, + "num_input_tokens_seen": 95473240, + "router_z_loss_clip": 3.6015625, + "router_z_loss_mlp": 2.53320312, + "step": 4421, + "time_per_iteration": 2.981598377227783 + }, + { + "auxiliary_loss_clip": 0.01510216, + "auxiliary_loss_mlp": 0.01280362, + "balance_loss_clip": 1.15108371, + "balance_loss_mlp": 1.026685, + "epoch": 0.5317140624060602, + "flos": 22750864824960.0, + "grad_norm": 1.8773046060683087, + "language_loss": 0.75814497, + "learning_rate": 1.8917758262625849e-06, + "loss": 0.7860508, + "num_input_tokens_seen": 95493480, + "router_z_loss_clip": 3.58984375, + "router_z_loss_mlp": 2.53515625, + "step": 4422, + "time_per_iteration": 3.149517297744751 + }, + { + "auxiliary_loss_clip": 0.01509368, + "auxiliary_loss_mlp": 0.01280249, + "balance_loss_clip": 1.14859653, + "balance_loss_mlp": 1.02962303, + "epoch": 0.5318343052966993, + "flos": 22823346267360.0, + "grad_norm": 1.7954309640300512, + "language_loss": 0.80850923, + "learning_rate": 1.8909980041402089e-06, + "loss": 0.8364054, + "num_input_tokens_seen": 95512075, + "router_z_loss_clip": 3.60742188, + "router_z_loss_mlp": 2.50390625, + "step": 4423, + "time_per_iteration": 3.1069529056549072 + }, + { + "auxiliary_loss_clip": 0.01512884, + "auxiliary_loss_mlp": 0.01275881, + "balance_loss_clip": 1.15238643, + "balance_loss_mlp": 1.02144051, + "epoch": 0.5319545481873384, + "flos": 13627799635680.0, + "grad_norm": 2.5177928653959776, + "language_loss": 0.65820283, + "learning_rate": 1.8902201985533494e-06, + "loss": 0.68609047, + "num_input_tokens_seen": 95529340, + "router_z_loss_clip": 3.60742188, + "router_z_loss_mlp": 2.54296875, + "step": 4424, + "time_per_iteration": 3.1066489219665527 + }, + { + "auxiliary_loss_clip": 0.01510792, + "auxiliary_loss_mlp": 0.012667, + "balance_loss_clip": 1.15012693, + "balance_loss_mlp": 1.01512027, + "epoch": 0.5320747910779775, + "flos": 22164641354880.0, + "grad_norm": 1.8824752893782282, + "language_loss": 0.74872047, + "learning_rate": 1.8894424096199983e-06, + "loss": 0.77649534, + "num_input_tokens_seen": 95548545, + "router_z_loss_clip": 3.609375, + "router_z_loss_mlp": 2.51367188, + "step": 4425, + "time_per_iteration": 3.0851547718048096 + }, + { + "auxiliary_loss_clip": 0.01516563, + "auxiliary_loss_mlp": 0.01275525, + "balance_loss_clip": 1.15662336, + "balance_loss_mlp": 1.02108502, + "epoch": 0.5321950339686166, + "flos": 18590066212320.0, + "grad_norm": 2.179357370095679, + "language_loss": 0.85877514, + "learning_rate": 1.8886646374581463e-06, + "loss": 0.88669598, + "num_input_tokens_seen": 95567770, + "router_z_loss_clip": 3.6015625, + "router_z_loss_mlp": 2.54296875, + "step": 4426, + "time_per_iteration": 3.0995254516601562 + }, + { + "auxiliary_loss_clip": 0.01515934, + "auxiliary_loss_mlp": 0.0129006, + "balance_loss_clip": 1.15498579, + "balance_loss_mlp": 1.03333139, + "epoch": 0.5323152768592557, + "flos": 22859037030240.0, + "grad_norm": 2.0971757627821987, + "language_loss": 0.71536762, + "learning_rate": 1.8878868821857795e-06, + "loss": 0.74342757, + "num_input_tokens_seen": 95587420, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 2.56640625, + "step": 4427, + "time_per_iteration": 3.1099629402160645 + }, + { + "auxiliary_loss_clip": 0.01516986, + "auxiliary_loss_mlp": 0.01289476, + "balance_loss_clip": 1.15798903, + "balance_loss_mlp": 1.03427243, + "epoch": 0.5324355197498948, + "flos": 33951541600800.0, + "grad_norm": 3.773659894282765, + "language_loss": 0.74594903, + "learning_rate": 1.8871091439208838e-06, + "loss": 0.77401364, + "num_input_tokens_seen": 95609030, + "router_z_loss_clip": 3.59179688, + "router_z_loss_mlp": 2.55078125, + "step": 4428, + "time_per_iteration": 3.120237112045288 + }, + { + "auxiliary_loss_clip": 0.01516366, + "auxiliary_loss_mlp": 0.01278565, + "balance_loss_clip": 1.15701997, + "balance_loss_mlp": 1.02431536, + "epoch": 0.5325557626405338, + "flos": 23258879700480.0, + "grad_norm": 2.2826654585547206, + "language_loss": 0.7752012, + "learning_rate": 1.8863314227814414e-06, + "loss": 0.80315059, + "num_input_tokens_seen": 95627340, + "router_z_loss_clip": 3.59375, + "router_z_loss_mlp": 2.54101562, + "step": 4429, + "time_per_iteration": 2.9923408031463623 + }, + { + "auxiliary_loss_clip": 0.01523389, + "auxiliary_loss_mlp": 0.01276821, + "balance_loss_clip": 1.16454649, + "balance_loss_mlp": 1.02180827, + "epoch": 0.532676005531173, + "flos": 26720807114880.0, + "grad_norm": 3.0239447748134785, + "language_loss": 0.49106687, + "learning_rate": 1.8855537188854313e-06, + "loss": 0.51906896, + "num_input_tokens_seen": 95646315, + "router_z_loss_clip": 3.58789062, + "router_z_loss_mlp": 2.54882812, + "step": 4430, + "time_per_iteration": 3.066352367401123 + }, + { + "auxiliary_loss_clip": 0.01512216, + "auxiliary_loss_mlp": 0.01277483, + "balance_loss_clip": 1.15361524, + "balance_loss_mlp": 1.02361488, + "epoch": 0.5327962484218121, + "flos": 17896542884640.0, + "grad_norm": 2.9234087465048506, + "language_loss": 0.78295219, + "learning_rate": 1.8847760323508315e-06, + "loss": 0.81084919, + "num_input_tokens_seen": 95665220, + "router_z_loss_clip": 3.58789062, + "router_z_loss_mlp": 2.53710938, + "step": 4431, + "time_per_iteration": 2.961045503616333 + }, + { + "auxiliary_loss_clip": 0.01510011, + "auxiliary_loss_mlp": 0.01280115, + "balance_loss_clip": 1.14803481, + "balance_loss_mlp": 1.02948952, + "epoch": 0.5329164913124511, + "flos": 17926999561440.0, + "grad_norm": 1.9023590207376528, + "language_loss": 0.7585398, + "learning_rate": 1.883998363295616e-06, + "loss": 0.78644109, + "num_input_tokens_seen": 95682700, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.50390625, + "step": 4432, + "time_per_iteration": 4.771336793899536 + }, + { + "auxiliary_loss_clip": 0.01492161, + "auxiliary_loss_mlp": 0.01205658, + "balance_loss_clip": 1.1373589, + "balance_loss_mlp": 1.00119019, + "epoch": 0.5330367342030903, + "flos": 57259879986240.0, + "grad_norm": 0.873223254571641, + "language_loss": 0.62591398, + "learning_rate": 1.8832207118377565e-06, + "loss": 0.65289211, + "num_input_tokens_seen": 95738070, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 2.046875, + "step": 4433, + "time_per_iteration": 3.357466220855713 + }, + { + "auxiliary_loss_clip": 0.01515317, + "auxiliary_loss_mlp": 0.01272429, + "balance_loss_clip": 1.1559515, + "balance_loss_mlp": 1.02199411, + "epoch": 0.5331569770937293, + "flos": 17422436148480.0, + "grad_norm": 1.986071429698426, + "language_loss": 0.69399083, + "learning_rate": 1.882443078095222e-06, + "loss": 0.72186828, + "num_input_tokens_seen": 95756950, + "router_z_loss_clip": 3.59375, + "router_z_loss_mlp": 2.50195312, + "step": 4434, + "time_per_iteration": 2.9842772483825684 + }, + { + "auxiliary_loss_clip": 0.01489313, + "auxiliary_loss_mlp": 0.01203568, + "balance_loss_clip": 1.13501906, + "balance_loss_mlp": 0.99757385, + "epoch": 0.5332772199843684, + "flos": 56756871627840.0, + "grad_norm": 0.8904849646032927, + "language_loss": 0.66759741, + "learning_rate": 1.8816654621859794e-06, + "loss": 0.6945262, + "num_input_tokens_seen": 95816615, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 2.0625, + "step": 4435, + "time_per_iteration": 3.2772014141082764 + }, + { + "auxiliary_loss_clip": 0.01514072, + "auxiliary_loss_mlp": 0.01279218, + "balance_loss_clip": 1.15417123, + "balance_loss_mlp": 1.0272572, + "epoch": 0.5333974628750076, + "flos": 18699755544000.0, + "grad_norm": 2.3147340051470024, + "language_loss": 0.7273832, + "learning_rate": 1.8808878642279915e-06, + "loss": 0.75531608, + "num_input_tokens_seen": 95832020, + "router_z_loss_clip": 3.59570312, + "router_z_loss_mlp": 2.51757812, + "step": 4436, + "time_per_iteration": 3.8055307865142822 + }, + { + "auxiliary_loss_clip": 0.01517737, + "auxiliary_loss_mlp": 0.01283937, + "balance_loss_clip": 1.15675044, + "balance_loss_mlp": 1.02892458, + "epoch": 0.5335177057656466, + "flos": 23807781496800.0, + "grad_norm": 2.412235717202223, + "language_loss": 0.64894634, + "learning_rate": 1.8801102843392209e-06, + "loss": 0.67696309, + "num_input_tokens_seen": 95851425, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 2.54882812, + "step": 4437, + "time_per_iteration": 3.0694241523742676 + }, + { + "auxiliary_loss_clip": 0.01513578, + "auxiliary_loss_mlp": 0.01279545, + "balance_loss_clip": 1.15328681, + "balance_loss_mlp": 1.02777481, + "epoch": 0.5336379486562857, + "flos": 25080815010240.0, + "grad_norm": 1.6397589278271543, + "language_loss": 0.85094875, + "learning_rate": 1.8793327226376238e-06, + "loss": 0.87888002, + "num_input_tokens_seen": 95870745, + "router_z_loss_clip": 3.6015625, + "router_z_loss_mlp": 2.515625, + "step": 4438, + "time_per_iteration": 3.0465142726898193 + }, + { + "auxiliary_loss_clip": 0.01517487, + "auxiliary_loss_mlp": 0.01280844, + "balance_loss_clip": 1.15734982, + "balance_loss_mlp": 1.02697563, + "epoch": 0.5337581915469248, + "flos": 21398788297440.0, + "grad_norm": 3.8662462061919785, + "language_loss": 0.79656225, + "learning_rate": 1.8785551792411569e-06, + "loss": 0.82454556, + "num_input_tokens_seen": 95889755, + "router_z_loss_clip": 3.60351562, + "router_z_loss_mlp": 2.53710938, + "step": 4439, + "time_per_iteration": 3.0659968852996826 + }, + { + "auxiliary_loss_clip": 0.01516441, + "auxiliary_loss_mlp": 0.01277741, + "balance_loss_clip": 1.15751147, + "balance_loss_mlp": 1.02673364, + "epoch": 0.5338784344375639, + "flos": 14867645644800.0, + "grad_norm": 2.2778422994273493, + "language_loss": 0.82476825, + "learning_rate": 1.8777776542677733e-06, + "loss": 0.85271007, + "num_input_tokens_seen": 95907805, + "router_z_loss_clip": 3.59179688, + "router_z_loss_mlp": 2.5078125, + "step": 4440, + "time_per_iteration": 4.020503997802734 + }, + { + "auxiliary_loss_clip": 0.01511238, + "auxiliary_loss_mlp": 0.01273238, + "balance_loss_clip": 1.15135145, + "balance_loss_mlp": 1.01841629, + "epoch": 0.5339986773282029, + "flos": 20815599080160.0, + "grad_norm": 2.5577460686101774, + "language_loss": 0.73001343, + "learning_rate": 1.8770001478354216e-06, + "loss": 0.75785816, + "num_input_tokens_seen": 95927480, + "router_z_loss_clip": 3.60351562, + "router_z_loss_mlp": 2.546875, + "step": 4441, + "time_per_iteration": 3.00765323638916 + }, + { + "auxiliary_loss_clip": 0.01516931, + "auxiliary_loss_mlp": 0.01282031, + "balance_loss_clip": 1.15613806, + "balance_loss_mlp": 1.03045201, + "epoch": 0.5341189202188421, + "flos": 17971110375840.0, + "grad_norm": 2.3930885560018718, + "language_loss": 0.83913034, + "learning_rate": 1.8762226600620504e-06, + "loss": 0.86711991, + "num_input_tokens_seen": 95946095, + "router_z_loss_clip": 3.60742188, + "router_z_loss_mlp": 2.51367188, + "step": 4442, + "time_per_iteration": 3.0732409954071045 + }, + { + "auxiliary_loss_clip": 0.01520281, + "auxiliary_loss_mlp": 0.01280068, + "balance_loss_clip": 1.16024899, + "balance_loss_mlp": 1.02562761, + "epoch": 0.5342391631094812, + "flos": 11033222127840.0, + "grad_norm": 2.616698545683387, + "language_loss": 0.58999383, + "learning_rate": 1.8754451910656031e-06, + "loss": 0.61799729, + "num_input_tokens_seen": 95959995, + "router_z_loss_clip": 3.6015625, + "router_z_loss_mlp": 2.54296875, + "step": 4443, + "time_per_iteration": 3.009793281555176 + }, + { + "auxiliary_loss_clip": 0.01519914, + "auxiliary_loss_mlp": 0.01279759, + "balance_loss_clip": 1.16184306, + "balance_loss_mlp": 1.02341104, + "epoch": 0.5343594060001202, + "flos": 15340842105120.0, + "grad_norm": 3.9848832432160797, + "language_loss": 0.82621241, + "learning_rate": 1.8746677409640212e-06, + "loss": 0.85420918, + "num_input_tokens_seen": 95977095, + "router_z_loss_clip": 3.58203125, + "router_z_loss_mlp": 2.5625, + "step": 4444, + "time_per_iteration": 3.0246236324310303 + }, + { + "auxiliary_loss_clip": 0.01513261, + "auxiliary_loss_mlp": 0.01285451, + "balance_loss_clip": 1.15226698, + "balance_loss_mlp": 1.03406215, + "epoch": 0.5344796488907594, + "flos": 26903015817120.0, + "grad_norm": 2.404920207313695, + "language_loss": 0.8475554, + "learning_rate": 1.8738903098752432e-06, + "loss": 0.87554252, + "num_input_tokens_seen": 95996225, + "router_z_loss_clip": 3.60742188, + "router_z_loss_mlp": 2.51171875, + "step": 4445, + "time_per_iteration": 3.101085662841797 + }, + { + "auxiliary_loss_clip": 0.01523768, + "auxiliary_loss_mlp": 0.01279333, + "balance_loss_clip": 1.16286027, + "balance_loss_mlp": 1.02718163, + "epoch": 0.5345998917813984, + "flos": 25413599964960.0, + "grad_norm": 2.537462489614473, + "language_loss": 0.73828638, + "learning_rate": 1.8731128979172052e-06, + "loss": 0.76631743, + "num_input_tokens_seen": 96015425, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 2.51953125, + "step": 4446, + "time_per_iteration": 3.028440475463867 + }, + { + "auxiliary_loss_clip": 0.01507267, + "auxiliary_loss_mlp": 0.01276823, + "balance_loss_clip": 1.14639258, + "balance_loss_mlp": 1.02734232, + "epoch": 0.5347201346720375, + "flos": 32856127482240.0, + "grad_norm": 3.1174774072312803, + "language_loss": 0.671866, + "learning_rate": 1.8723355052078394e-06, + "loss": 0.69970685, + "num_input_tokens_seen": 96035460, + "router_z_loss_clip": 3.609375, + "router_z_loss_mlp": 2.4921875, + "step": 4447, + "time_per_iteration": 3.102715492248535 + }, + { + "auxiliary_loss_clip": 0.01514487, + "auxiliary_loss_mlp": 0.01284168, + "balance_loss_clip": 1.15363288, + "balance_loss_mlp": 1.02972794, + "epoch": 0.5348403775626767, + "flos": 17969896674720.0, + "grad_norm": 2.339252390716237, + "language_loss": 0.77141786, + "learning_rate": 1.8715581318650765e-06, + "loss": 0.7994045, + "num_input_tokens_seen": 96054515, + "router_z_loss_clip": 3.609375, + "router_z_loss_mlp": 2.54296875, + "step": 4448, + "time_per_iteration": 3.0778911113739014 + }, + { + "auxiliary_loss_clip": 0.01525594, + "auxiliary_loss_mlp": 0.01295082, + "balance_loss_clip": 1.16455936, + "balance_loss_mlp": 1.03816223, + "epoch": 0.5349606204533157, + "flos": 17605289629440.0, + "grad_norm": 2.515434546674567, + "language_loss": 0.81892872, + "learning_rate": 1.8707807780068422e-06, + "loss": 0.84713548, + "num_input_tokens_seen": 96072330, + "router_z_loss_clip": 3.609375, + "router_z_loss_mlp": 2.56835938, + "step": 4449, + "time_per_iteration": 3.0076491832733154 + }, + { + "auxiliary_loss_clip": 0.01513836, + "auxiliary_loss_mlp": 0.01271658, + "balance_loss_clip": 1.15439248, + "balance_loss_mlp": 1.01950645, + "epoch": 0.5350808633439548, + "flos": 29170345881600.0, + "grad_norm": 2.145654983161443, + "language_loss": 0.66419232, + "learning_rate": 1.8700034437510611e-06, + "loss": 0.6920473, + "num_input_tokens_seen": 96092425, + "router_z_loss_clip": 3.59570312, + "router_z_loss_mlp": 2.51953125, + "step": 4450, + "time_per_iteration": 3.096036434173584 + }, + { + "auxiliary_loss_clip": 0.01513823, + "auxiliary_loss_mlp": 0.01279209, + "balance_loss_clip": 1.15186834, + "balance_loss_mlp": 1.02743864, + "epoch": 0.5352011062345938, + "flos": 19502019999360.0, + "grad_norm": 2.605421460406085, + "language_loss": 0.81745929, + "learning_rate": 1.8692261292156549e-06, + "loss": 0.8453896, + "num_input_tokens_seen": 96111660, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.515625, + "step": 4451, + "time_per_iteration": 3.0580272674560547 + }, + { + "auxiliary_loss_clip": 0.01515758, + "auxiliary_loss_mlp": 0.01272213, + "balance_loss_clip": 1.15411258, + "balance_loss_mlp": 1.02158701, + "epoch": 0.535321349125233, + "flos": 23479927202880.0, + "grad_norm": 2.378293746564143, + "language_loss": 0.81154841, + "learning_rate": 1.8684488345185401e-06, + "loss": 0.83942819, + "num_input_tokens_seen": 96131835, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 2.50390625, + "step": 4452, + "time_per_iteration": 2.990020751953125 + }, + { + "auxiliary_loss_clip": 0.01526249, + "auxiliary_loss_mlp": 0.01287748, + "balance_loss_clip": 1.16623342, + "balance_loss_mlp": 1.03349876, + "epoch": 0.535441592015872, + "flos": 20479741944480.0, + "grad_norm": 2.2466153967359337, + "language_loss": 0.78903472, + "learning_rate": 1.8676715597776332e-06, + "loss": 0.81717479, + "num_input_tokens_seen": 96150180, + "router_z_loss_clip": 3.6015625, + "router_z_loss_mlp": 2.54101562, + "step": 4453, + "time_per_iteration": 2.9493603706359863 + }, + { + "auxiliary_loss_clip": 0.01515713, + "auxiliary_loss_mlp": 0.01266591, + "balance_loss_clip": 1.15507197, + "balance_loss_mlp": 1.01710963, + "epoch": 0.5355618349065111, + "flos": 19575184148640.0, + "grad_norm": 8.021286604408699, + "language_loss": 0.76595747, + "learning_rate": 1.8668943051108455e-06, + "loss": 0.79378057, + "num_input_tokens_seen": 96167485, + "router_z_loss_clip": 3.60742188, + "router_z_loss_mlp": 2.4921875, + "step": 4454, + "time_per_iteration": 2.980151414871216 + }, + { + "auxiliary_loss_clip": 0.01513775, + "auxiliary_loss_mlp": 0.0128195, + "balance_loss_clip": 1.15329826, + "balance_loss_mlp": 1.02731895, + "epoch": 0.5356820777971503, + "flos": 24026932591200.0, + "grad_norm": 2.9802756915295627, + "language_loss": 0.76583064, + "learning_rate": 1.8661170706360856e-06, + "loss": 0.7937879, + "num_input_tokens_seen": 96186650, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 2.54492188, + "step": 4455, + "time_per_iteration": 3.0920298099517822 + }, + { + "auxiliary_loss_clip": 0.01512067, + "auxiliary_loss_mlp": 0.01267995, + "balance_loss_clip": 1.15045691, + "balance_loss_mlp": 1.01794171, + "epoch": 0.5358023206877893, + "flos": 20886677180640.0, + "grad_norm": 1.7059642331875822, + "language_loss": 0.8138203, + "learning_rate": 1.8653398564712594e-06, + "loss": 0.84162092, + "num_input_tokens_seen": 96205595, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 2.49804688, + "step": 4456, + "time_per_iteration": 2.9716570377349854 + }, + { + "auxiliary_loss_clip": 0.01513183, + "auxiliary_loss_mlp": 0.01280341, + "balance_loss_clip": 1.15173841, + "balance_loss_mlp": 1.0295248, + "epoch": 0.5359225635784284, + "flos": 22421189979360.0, + "grad_norm": 2.182717855847292, + "language_loss": 0.82639319, + "learning_rate": 1.8645626627342704e-06, + "loss": 0.85432839, + "num_input_tokens_seen": 96226360, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 2.50585938, + "step": 4457, + "time_per_iteration": 2.93216609954834 + }, + { + "auxiliary_loss_clip": 0.01512972, + "auxiliary_loss_mlp": 0.01279827, + "balance_loss_clip": 1.15159106, + "balance_loss_mlp": 1.02920127, + "epoch": 0.5360428064690675, + "flos": 24100096740480.0, + "grad_norm": 2.4105251557811034, + "language_loss": 0.8084479, + "learning_rate": 1.8637854895430172e-06, + "loss": 0.83637589, + "num_input_tokens_seen": 96245625, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 2.50390625, + "step": 4458, + "time_per_iteration": 3.025728702545166 + }, + { + "auxiliary_loss_clip": 0.01513344, + "auxiliary_loss_mlp": 0.01283195, + "balance_loss_clip": 1.15271878, + "balance_loss_mlp": 1.02932703, + "epoch": 0.5361630493597066, + "flos": 21436678893600.0, + "grad_norm": 2.3706544232599422, + "language_loss": 0.69482982, + "learning_rate": 1.8630083370153978e-06, + "loss": 0.72279525, + "num_input_tokens_seen": 96265265, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 2.53710938, + "step": 4459, + "time_per_iteration": 3.8892385959625244 + }, + { + "auxiliary_loss_clip": 0.01487763, + "auxiliary_loss_mlp": 0.01224632, + "balance_loss_clip": 1.13166809, + "balance_loss_mlp": 1.01940155, + "epoch": 0.5362832922503457, + "flos": 68894800637760.0, + "grad_norm": 0.7557312449931849, + "language_loss": 0.5525713, + "learning_rate": 1.8622312052693041e-06, + "loss": 0.57969528, + "num_input_tokens_seen": 96326445, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.0546875, + "step": 4460, + "time_per_iteration": 4.339648485183716 + }, + { + "auxiliary_loss_clip": 0.01504154, + "auxiliary_loss_mlp": 0.01272659, + "balance_loss_clip": 1.14185333, + "balance_loss_mlp": 1.02184296, + "epoch": 0.5364035351409848, + "flos": 9795120814080.0, + "grad_norm": 3.466654072472923, + "language_loss": 0.72019076, + "learning_rate": 1.8614540944226267e-06, + "loss": 0.7479589, + "num_input_tokens_seen": 96343115, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 2.50585938, + "step": 4461, + "time_per_iteration": 3.1560282707214355 + }, + { + "auxiliary_loss_clip": 0.01520758, + "auxiliary_loss_mlp": 0.01275745, + "balance_loss_clip": 1.15903997, + "balance_loss_mlp": 1.02569163, + "epoch": 0.5365237780316239, + "flos": 23292067204800.0, + "grad_norm": 1.7778042972084862, + "language_loss": 0.68270278, + "learning_rate": 1.8606770045932537e-06, + "loss": 0.71066779, + "num_input_tokens_seen": 96362230, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.49804688, + "step": 4462, + "time_per_iteration": 3.0987613201141357 + }, + { + "auxiliary_loss_clip": 0.0151681, + "auxiliary_loss_mlp": 0.01280143, + "balance_loss_clip": 1.15638328, + "balance_loss_mlp": 1.02799177, + "epoch": 0.5366440209222629, + "flos": 26580547321920.0, + "grad_norm": 1.9385440754832384, + "language_loss": 0.81787366, + "learning_rate": 1.859899935899068e-06, + "loss": 0.8458432, + "num_input_tokens_seen": 96382085, + "router_z_loss_clip": 3.60742188, + "router_z_loss_mlp": 2.51953125, + "step": 4463, + "time_per_iteration": 3.9373722076416016 + }, + { + "auxiliary_loss_clip": 0.01518166, + "auxiliary_loss_mlp": 0.01281702, + "balance_loss_clip": 1.15785182, + "balance_loss_mlp": 1.02993202, + "epoch": 0.5367642638129021, + "flos": 19610002563840.0, + "grad_norm": 1.6663026555974239, + "language_loss": 0.7910713, + "learning_rate": 1.8591228884579506e-06, + "loss": 0.81906998, + "num_input_tokens_seen": 96400580, + "router_z_loss_clip": 3.60742188, + "router_z_loss_mlp": 2.515625, + "step": 4464, + "time_per_iteration": 3.0475168228149414 + }, + { + "auxiliary_loss_clip": 0.01508939, + "auxiliary_loss_mlp": 0.01283221, + "balance_loss_clip": 1.14652729, + "balance_loss_mlp": 1.03145075, + "epoch": 0.5368845067035412, + "flos": 23917508756640.0, + "grad_norm": 2.3824785321256168, + "language_loss": 0.81872857, + "learning_rate": 1.8583458623877795e-06, + "loss": 0.84665024, + "num_input_tokens_seen": 96419680, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.515625, + "step": 4465, + "time_per_iteration": 3.037541627883911 + }, + { + "auxiliary_loss_clip": 0.01513225, + "auxiliary_loss_mlp": 0.01279148, + "balance_loss_clip": 1.15394092, + "balance_loss_mlp": 1.02699709, + "epoch": 0.5370047495941802, + "flos": 16875013550400.0, + "grad_norm": 2.6298298109038267, + "language_loss": 0.74274349, + "learning_rate": 1.8575688578064281e-06, + "loss": 0.7706672, + "num_input_tokens_seen": 96437805, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 2.51953125, + "step": 4466, + "time_per_iteration": 3.0187466144561768 + }, + { + "auxiliary_loss_clip": 0.01521628, + "auxiliary_loss_mlp": 0.01287959, + "balance_loss_clip": 1.16129446, + "balance_loss_mlp": 1.03523481, + "epoch": 0.5371249924848194, + "flos": 20743041781440.0, + "grad_norm": 1.8581221638248238, + "language_loss": 0.76563901, + "learning_rate": 1.8567918748317674e-06, + "loss": 0.79373485, + "num_input_tokens_seen": 96457155, + "router_z_loss_clip": 3.60351562, + "router_z_loss_mlp": 2.52539062, + "step": 4467, + "time_per_iteration": 3.808624505996704 + }, + { + "auxiliary_loss_clip": 0.01510912, + "auxiliary_loss_mlp": 0.01278408, + "balance_loss_clip": 1.15003109, + "balance_loss_mlp": 1.02549362, + "epoch": 0.5372452353754584, + "flos": 17970086315520.0, + "grad_norm": 2.1870851028129557, + "language_loss": 0.83093828, + "learning_rate": 1.8560149135816659e-06, + "loss": 0.85883147, + "num_input_tokens_seen": 96473990, + "router_z_loss_clip": 3.609375, + "router_z_loss_mlp": 2.52734375, + "step": 4468, + "time_per_iteration": 2.954028844833374 + }, + { + "auxiliary_loss_clip": 0.01513558, + "auxiliary_loss_mlp": 0.01273385, + "balance_loss_clip": 1.15315962, + "balance_loss_mlp": 1.02466655, + "epoch": 0.5373654782660975, + "flos": 15378808557600.0, + "grad_norm": 2.391965085275428, + "language_loss": 0.84783959, + "learning_rate": 1.8552379741739873e-06, + "loss": 0.87570906, + "num_input_tokens_seen": 96491335, + "router_z_loss_clip": 3.609375, + "router_z_loss_mlp": 2.484375, + "step": 4469, + "time_per_iteration": 2.965373992919922 + }, + { + "auxiliary_loss_clip": 0.01491593, + "auxiliary_loss_mlp": 0.01195938, + "balance_loss_clip": 1.13592863, + "balance_loss_mlp": 0.99223328, + "epoch": 0.5374857211567367, + "flos": 69006310521120.0, + "grad_norm": 0.902306562511971, + "language_loss": 0.55562472, + "learning_rate": 1.8544610567265935e-06, + "loss": 0.58249998, + "num_input_tokens_seen": 96545275, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.0390625, + "step": 4470, + "time_per_iteration": 3.443575859069824 + }, + { + "auxiliary_loss_clip": 0.01513313, + "auxiliary_loss_mlp": 0.0127689, + "balance_loss_clip": 1.15326238, + "balance_loss_mlp": 1.0279808, + "epoch": 0.5376059640473757, + "flos": 15087138092640.0, + "grad_norm": 2.056860466395727, + "language_loss": 0.83194232, + "learning_rate": 1.853684161357341e-06, + "loss": 0.85984433, + "num_input_tokens_seen": 96562935, + "router_z_loss_clip": 3.60351562, + "router_z_loss_mlp": 2.48632812, + "step": 4471, + "time_per_iteration": 3.0224905014038086 + }, + { + "auxiliary_loss_clip": 0.01515275, + "auxiliary_loss_mlp": 0.01271746, + "balance_loss_clip": 1.15474021, + "balance_loss_mlp": 1.02283669, + "epoch": 0.5377262069380148, + "flos": 19794676596480.0, + "grad_norm": 1.8746666145990818, + "language_loss": 0.76935554, + "learning_rate": 1.852907288184085e-06, + "loss": 0.79722571, + "num_input_tokens_seen": 96581820, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 2.48632812, + "step": 4472, + "time_per_iteration": 3.0549395084381104 + }, + { + "auxiliary_loss_clip": 0.01519709, + "auxiliary_loss_mlp": 0.01275877, + "balance_loss_clip": 1.16030264, + "balance_loss_mlp": 1.02048337, + "epoch": 0.5378464498286539, + "flos": 30005418559680.0, + "grad_norm": 1.7955721671424592, + "language_loss": 0.70139402, + "learning_rate": 1.8521304373246762e-06, + "loss": 0.72934991, + "num_input_tokens_seen": 96602865, + "router_z_loss_clip": 3.59570312, + "router_z_loss_mlp": 2.55273438, + "step": 4473, + "time_per_iteration": 3.1386029720306396 + }, + { + "auxiliary_loss_clip": 0.01517639, + "auxiliary_loss_mlp": 0.01277695, + "balance_loss_clip": 1.15865326, + "balance_loss_mlp": 1.02325439, + "epoch": 0.537966692719293, + "flos": 21253332346560.0, + "grad_norm": 3.587888568021733, + "language_loss": 0.88776612, + "learning_rate": 1.8513536088969626e-06, + "loss": 0.91571951, + "num_input_tokens_seen": 96620530, + "router_z_loss_clip": 3.59570312, + "router_z_loss_mlp": 2.54296875, + "step": 4474, + "time_per_iteration": 2.92602801322937 + }, + { + "auxiliary_loss_clip": 0.01520772, + "auxiliary_loss_mlp": 0.01291738, + "balance_loss_clip": 1.15987206, + "balance_loss_mlp": 1.03653491, + "epoch": 0.538086935609932, + "flos": 21545306236800.0, + "grad_norm": 1.9297927814609193, + "language_loss": 0.80566859, + "learning_rate": 1.8505768030187884e-06, + "loss": 0.83379376, + "num_input_tokens_seen": 96640660, + "router_z_loss_clip": 3.60742188, + "router_z_loss_mlp": 2.55078125, + "step": 4475, + "time_per_iteration": 3.0233654975891113 + }, + { + "auxiliary_loss_clip": 0.01512063, + "auxiliary_loss_mlp": 0.01275098, + "balance_loss_clip": 1.15113068, + "balance_loss_mlp": 1.0235188, + "epoch": 0.5382071785005712, + "flos": 22749423554880.0, + "grad_norm": 1.8612871120082684, + "language_loss": 0.80217409, + "learning_rate": 1.849800019807995e-06, + "loss": 0.8300457, + "num_input_tokens_seen": 96661885, + "router_z_loss_clip": 3.609375, + "router_z_loss_mlp": 2.51367188, + "step": 4476, + "time_per_iteration": 3.031132459640503 + }, + { + "auxiliary_loss_clip": 0.01517195, + "auxiliary_loss_mlp": 0.01287798, + "balance_loss_clip": 1.15757132, + "balance_loss_mlp": 1.03373957, + "epoch": 0.5383274213912103, + "flos": 24936800329440.0, + "grad_norm": 2.5977423293862563, + "language_loss": 0.71288431, + "learning_rate": 1.8490232593824186e-06, + "loss": 0.74093413, + "num_input_tokens_seen": 96678340, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 2.5390625, + "step": 4477, + "time_per_iteration": 3.002535343170166 + }, + { + "auxiliary_loss_clip": 0.01511171, + "auxiliary_loss_mlp": 0.0128087, + "balance_loss_clip": 1.15099442, + "balance_loss_mlp": 1.02967191, + "epoch": 0.5384476642818493, + "flos": 22312410923520.0, + "grad_norm": 2.605977705245302, + "language_loss": 0.8489604, + "learning_rate": 1.8482465218598935e-06, + "loss": 0.87688076, + "num_input_tokens_seen": 96698285, + "router_z_loss_clip": 3.60351562, + "router_z_loss_mlp": 2.50976562, + "step": 4478, + "time_per_iteration": 3.07924747467041 + }, + { + "auxiliary_loss_clip": 0.01514214, + "auxiliary_loss_mlp": 0.0128754, + "balance_loss_clip": 1.15432429, + "balance_loss_mlp": 1.03405344, + "epoch": 0.5385679071724885, + "flos": 22713239725920.0, + "grad_norm": 1.9467439558844826, + "language_loss": 0.8354336, + "learning_rate": 1.8474698073582508e-06, + "loss": 0.86345112, + "num_input_tokens_seen": 96719655, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 2.53320312, + "step": 4479, + "time_per_iteration": 3.0022857189178467 + }, + { + "auxiliary_loss_clip": 0.01514003, + "auxiliary_loss_mlp": 0.01290525, + "balance_loss_clip": 1.15267372, + "balance_loss_mlp": 1.03703845, + "epoch": 0.5386881500631275, + "flos": 15955170706080.0, + "grad_norm": 2.390021926140428, + "language_loss": 0.87352836, + "learning_rate": 1.8466931159953166e-06, + "loss": 0.90157366, + "num_input_tokens_seen": 96736290, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 2.53320312, + "step": 4480, + "time_per_iteration": 3.057260036468506 + }, + { + "auxiliary_loss_clip": 0.01515001, + "auxiliary_loss_mlp": 0.01282286, + "balance_loss_clip": 1.15431213, + "balance_loss_mlp": 1.0293715, + "epoch": 0.5388083929537666, + "flos": 24062282000640.0, + "grad_norm": 6.776637638989515, + "language_loss": 0.84344965, + "learning_rate": 1.8459164478889158e-06, + "loss": 0.87142253, + "num_input_tokens_seen": 96757685, + "router_z_loss_clip": 3.60351562, + "router_z_loss_mlp": 2.52734375, + "step": 4481, + "time_per_iteration": 3.1482553482055664 + }, + { + "auxiliary_loss_clip": 0.01511996, + "auxiliary_loss_mlp": 0.01275015, + "balance_loss_clip": 1.15157592, + "balance_loss_mlp": 1.02610588, + "epoch": 0.5389286358444056, + "flos": 22239095061600.0, + "grad_norm": 1.808727959136157, + "language_loss": 0.76197541, + "learning_rate": 1.8451398031568663e-06, + "loss": 0.78984547, + "num_input_tokens_seen": 96777310, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 2.48632812, + "step": 4482, + "time_per_iteration": 3.0732388496398926 + }, + { + "auxiliary_loss_clip": 0.01518026, + "auxiliary_loss_mlp": 0.01289719, + "balance_loss_clip": 1.158867, + "balance_loss_mlp": 1.03585136, + "epoch": 0.5390488787350448, + "flos": 24284125994400.0, + "grad_norm": 1.6826096756106659, + "language_loss": 0.74650109, + "learning_rate": 1.844363181916986e-06, + "loss": 0.77457851, + "num_input_tokens_seen": 96798035, + "router_z_loss_clip": 3.59375, + "router_z_loss_mlp": 2.53710938, + "step": 4483, + "time_per_iteration": 3.0680532455444336 + }, + { + "auxiliary_loss_clip": 0.01513262, + "auxiliary_loss_mlp": 0.01281475, + "balance_loss_clip": 1.1523844, + "balance_loss_mlp": 1.0277977, + "epoch": 0.5391691216256839, + "flos": 16583267229120.0, + "grad_norm": 1.9325696769442835, + "language_loss": 0.82927144, + "learning_rate": 1.8435865842870868e-06, + "loss": 0.8572188, + "num_input_tokens_seen": 96815975, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 2.53515625, + "step": 4484, + "time_per_iteration": 3.0994932651519775 + }, + { + "auxiliary_loss_clip": 0.0151219, + "auxiliary_loss_mlp": 0.01276622, + "balance_loss_clip": 1.15201569, + "balance_loss_mlp": 1.02447057, + "epoch": 0.5392893645163229, + "flos": 23332233490560.0, + "grad_norm": 2.210908119496367, + "language_loss": 0.72105598, + "learning_rate": 1.8428100103849787e-06, + "loss": 0.74894404, + "num_input_tokens_seen": 96835770, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 2.51953125, + "step": 4485, + "time_per_iteration": 3.1016180515289307 + }, + { + "auxiliary_loss_clip": 0.01516484, + "auxiliary_loss_mlp": 0.01289508, + "balance_loss_clip": 1.15650225, + "balance_loss_mlp": 1.03678393, + "epoch": 0.5394096074069621, + "flos": 15671654795520.0, + "grad_norm": 2.6970191311533216, + "language_loss": 0.73884082, + "learning_rate": 1.842033460328467e-06, + "loss": 0.76690072, + "num_input_tokens_seen": 96854490, + "router_z_loss_clip": 3.60351562, + "router_z_loss_mlp": 2.52539062, + "step": 4486, + "time_per_iteration": 3.932729959487915 + }, + { + "auxiliary_loss_clip": 0.01514247, + "auxiliary_loss_mlp": 0.01276234, + "balance_loss_clip": 1.1544044, + "balance_loss_mlp": 1.02694321, + "epoch": 0.5395298502976011, + "flos": 22895675997120.0, + "grad_norm": 1.8408840112747857, + "language_loss": 0.75459266, + "learning_rate": 1.8412569342353541e-06, + "loss": 0.78249753, + "num_input_tokens_seen": 96874645, + "router_z_loss_clip": 3.59570312, + "router_z_loss_mlp": 2.49023438, + "step": 4487, + "time_per_iteration": 3.839080572128296 + }, + { + "auxiliary_loss_clip": 0.01517909, + "auxiliary_loss_mlp": 0.01290836, + "balance_loss_clip": 1.15726089, + "balance_loss_mlp": 1.03582382, + "epoch": 0.5396500931882402, + "flos": 23844420463680.0, + "grad_norm": 2.014489768525784, + "language_loss": 0.84845507, + "learning_rate": 1.840480432223438e-06, + "loss": 0.87654251, + "num_input_tokens_seen": 96893650, + "router_z_loss_clip": 3.609375, + "router_z_loss_mlp": 2.54882812, + "step": 4488, + "time_per_iteration": 2.962980270385742 + }, + { + "auxiliary_loss_clip": 0.01512017, + "auxiliary_loss_mlp": 0.01272942, + "balance_loss_clip": 1.15238607, + "balance_loss_mlp": 1.02117157, + "epoch": 0.5397703360788794, + "flos": 26325402039360.0, + "grad_norm": 2.4187406010818746, + "language_loss": 0.77721274, + "learning_rate": 1.8397039544105131e-06, + "loss": 0.80506229, + "num_input_tokens_seen": 96912735, + "router_z_loss_clip": 3.59570312, + "router_z_loss_mlp": 2.515625, + "step": 4489, + "time_per_iteration": 3.119647741317749 + }, + { + "auxiliary_loss_clip": 0.01511068, + "auxiliary_loss_mlp": 0.01275558, + "balance_loss_clip": 1.15201449, + "balance_loss_mlp": 1.02397919, + "epoch": 0.5398905789695184, + "flos": 21216693379680.0, + "grad_norm": 1.9901634138997846, + "language_loss": 0.69961679, + "learning_rate": 1.8389275009143711e-06, + "loss": 0.72748309, + "num_input_tokens_seen": 96932475, + "router_z_loss_clip": 3.58984375, + "router_z_loss_mlp": 2.51367188, + "step": 4490, + "time_per_iteration": 2.9546382427215576 + }, + { + "auxiliary_loss_clip": 0.01507279, + "auxiliary_loss_mlp": 0.01275998, + "balance_loss_clip": 1.1465795, + "balance_loss_mlp": 1.02651644, + "epoch": 0.5400108218601575, + "flos": 25082559705600.0, + "grad_norm": 1.9213769034156518, + "language_loss": 0.73307455, + "learning_rate": 1.8381510718527988e-06, + "loss": 0.76090723, + "num_input_tokens_seen": 96952085, + "router_z_loss_clip": 3.60742188, + "router_z_loss_mlp": 2.4921875, + "step": 4491, + "time_per_iteration": 3.8770415782928467 + }, + { + "auxiliary_loss_clip": 0.01508617, + "auxiliary_loss_mlp": 0.01283503, + "balance_loss_clip": 1.14659321, + "balance_loss_mlp": 1.0313518, + "epoch": 0.5401310647507966, + "flos": 26359803244800.0, + "grad_norm": 1.9483602774014348, + "language_loss": 0.63287294, + "learning_rate": 1.8373746673435812e-06, + "loss": 0.66079414, + "num_input_tokens_seen": 96973110, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.51953125, + "step": 4492, + "time_per_iteration": 3.0278658866882324 + }, + { + "auxiliary_loss_clip": 0.01505225, + "auxiliary_loss_mlp": 0.01277129, + "balance_loss_clip": 1.14350879, + "balance_loss_mlp": 1.02497756, + "epoch": 0.5402513076414357, + "flos": 27857828789280.0, + "grad_norm": 1.8507385167757415, + "language_loss": 0.78596115, + "learning_rate": 1.8365982875044964e-06, + "loss": 0.81378472, + "num_input_tokens_seen": 96993420, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 2.51953125, + "step": 4493, + "time_per_iteration": 2.9708468914031982 + }, + { + "auxiliary_loss_clip": 0.01509026, + "auxiliary_loss_mlp": 0.01275994, + "balance_loss_clip": 1.14722872, + "balance_loss_mlp": 1.02403295, + "epoch": 0.5403715505320748, + "flos": 22896017350560.0, + "grad_norm": 2.7754350961077097, + "language_loss": 0.7582047, + "learning_rate": 1.8358219324533217e-06, + "loss": 0.78605491, + "num_input_tokens_seen": 97013685, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.51757812, + "step": 4494, + "time_per_iteration": 3.8163599967956543 + }, + { + "auxiliary_loss_clip": 0.01505586, + "auxiliary_loss_mlp": 0.01269299, + "balance_loss_clip": 1.14584267, + "balance_loss_mlp": 1.02248836, + "epoch": 0.5404917934227139, + "flos": 30226617774720.0, + "grad_norm": 3.4083947451517353, + "language_loss": 0.70225799, + "learning_rate": 1.8350456023078292e-06, + "loss": 0.73000681, + "num_input_tokens_seen": 97036060, + "router_z_loss_clip": 3.6015625, + "router_z_loss_mlp": 2.46484375, + "step": 4495, + "time_per_iteration": 3.2244679927825928 + }, + { + "auxiliary_loss_clip": 0.01509873, + "auxiliary_loss_mlp": 0.01280349, + "balance_loss_clip": 1.15131843, + "balance_loss_mlp": 1.02609944, + "epoch": 0.540612036313353, + "flos": 19940587685280.0, + "grad_norm": 2.3163533850695046, + "language_loss": 0.78205884, + "learning_rate": 1.8342692971857874e-06, + "loss": 0.80996102, + "num_input_tokens_seen": 97055260, + "router_z_loss_clip": 3.58203125, + "router_z_loss_mlp": 2.54101562, + "step": 4496, + "time_per_iteration": 3.1190028190612793 + }, + { + "auxiliary_loss_clip": 0.01505603, + "auxiliary_loss_mlp": 0.01280573, + "balance_loss_clip": 1.14514256, + "balance_loss_mlp": 1.02899361, + "epoch": 0.540732279203992, + "flos": 24281888232960.0, + "grad_norm": 3.095374064727645, + "language_loss": 0.71597838, + "learning_rate": 1.833493017204962e-06, + "loss": 0.7438401, + "num_input_tokens_seen": 97075365, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 2.51367188, + "step": 4497, + "time_per_iteration": 3.135603189468384 + }, + { + "auxiliary_loss_clip": 0.0150773, + "auxiliary_loss_mlp": 0.01281931, + "balance_loss_clip": 1.14778197, + "balance_loss_mlp": 1.03111422, + "epoch": 0.5408525220946312, + "flos": 20195277829920.0, + "grad_norm": 2.3535458451770497, + "language_loss": 0.77929175, + "learning_rate": 1.8327167624831134e-06, + "loss": 0.80718839, + "num_input_tokens_seen": 97093095, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 2.50585938, + "step": 4498, + "time_per_iteration": 3.0974557399749756 + }, + { + "auxiliary_loss_clip": 0.01510508, + "auxiliary_loss_mlp": 0.01278197, + "balance_loss_clip": 1.15072417, + "balance_loss_mlp": 1.02566409, + "epoch": 0.5409727649852702, + "flos": 24136773635520.0, + "grad_norm": 1.867009829334492, + "language_loss": 0.70631945, + "learning_rate": 1.831940533137999e-06, + "loss": 0.73420656, + "num_input_tokens_seen": 97112000, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 2.5234375, + "step": 4499, + "time_per_iteration": 3.0169906616210938 + }, + { + "auxiliary_loss_clip": 0.01506669, + "auxiliary_loss_mlp": 0.01273119, + "balance_loss_clip": 1.1468401, + "balance_loss_mlp": 1.02020454, + "epoch": 0.5410930078759093, + "flos": 23914853785440.0, + "grad_norm": 1.742157207149038, + "language_loss": 0.72398627, + "learning_rate": 1.8311643292873718e-06, + "loss": 0.75178421, + "num_input_tokens_seen": 97130820, + "router_z_loss_clip": 3.59570312, + "router_z_loss_mlp": 2.52734375, + "step": 4500, + "time_per_iteration": 3.07528018951416 + }, + { + "auxiliary_loss_clip": 0.01497873, + "auxiliary_loss_mlp": 0.01280212, + "balance_loss_clip": 1.13660705, + "balance_loss_mlp": 1.03168416, + "epoch": 0.5412132507665485, + "flos": 21107231616960.0, + "grad_norm": 1.9699293059994185, + "language_loss": 0.87939358, + "learning_rate": 1.8303881510489818e-06, + "loss": 0.90717447, + "num_input_tokens_seen": 97149210, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 2.48242188, + "step": 4501, + "time_per_iteration": 3.045116901397705 + }, + { + "auxiliary_loss_clip": 0.01502419, + "auxiliary_loss_mlp": 0.01278228, + "balance_loss_clip": 1.1423769, + "balance_loss_mlp": 1.02283406, + "epoch": 0.5413334936571875, + "flos": 30229993380960.0, + "grad_norm": 2.1230860062392165, + "language_loss": 0.69279832, + "learning_rate": 1.829611998540574e-06, + "loss": 0.72060478, + "num_input_tokens_seen": 97170415, + "router_z_loss_clip": 3.6015625, + "router_z_loss_mlp": 2.55273438, + "step": 4502, + "time_per_iteration": 3.0720553398132324 + }, + { + "auxiliary_loss_clip": 0.0150055, + "auxiliary_loss_mlp": 0.01273109, + "balance_loss_clip": 1.13936019, + "balance_loss_mlp": 1.02172089, + "epoch": 0.5414537365478266, + "flos": 24281926161120.0, + "grad_norm": 1.871556687163109, + "language_loss": 0.80141819, + "learning_rate": 1.8288358718798914e-06, + "loss": 0.82915479, + "num_input_tokens_seen": 97189605, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 2.51171875, + "step": 4503, + "time_per_iteration": 3.1049885749816895 + }, + { + "auxiliary_loss_clip": 0.01505645, + "auxiliary_loss_mlp": 0.01281011, + "balance_loss_clip": 1.14340055, + "balance_loss_mlp": 1.02885962, + "epoch": 0.5415739794384657, + "flos": 16656431378400.0, + "grad_norm": 1.8677137806443276, + "language_loss": 0.72493124, + "learning_rate": 1.8280597711846703e-06, + "loss": 0.75279778, + "num_input_tokens_seen": 97207845, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 2.51953125, + "step": 4504, + "time_per_iteration": 2.982271194458008 + }, + { + "auxiliary_loss_clip": 0.0150424, + "auxiliary_loss_mlp": 0.01270292, + "balance_loss_clip": 1.1427902, + "balance_loss_mlp": 1.02119219, + "epoch": 0.5416942223291048, + "flos": 23187991240800.0, + "grad_norm": 2.50788038026167, + "language_loss": 0.83488762, + "learning_rate": 1.8272836965726455e-06, + "loss": 0.86263293, + "num_input_tokens_seen": 97226780, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 2.48828125, + "step": 4505, + "time_per_iteration": 3.1343491077423096 + }, + { + "auxiliary_loss_clip": 0.01506252, + "auxiliary_loss_mlp": 0.0127769, + "balance_loss_clip": 1.14408195, + "balance_loss_mlp": 1.02553844, + "epoch": 0.5418144652197439, + "flos": 20305346443200.0, + "grad_norm": 2.0512991539517476, + "language_loss": 0.78241193, + "learning_rate": 1.8265076481615461e-06, + "loss": 0.81025136, + "num_input_tokens_seen": 97246695, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 2.51953125, + "step": 4506, + "time_per_iteration": 3.0309276580810547 + }, + { + "auxiliary_loss_clip": 0.01506648, + "auxiliary_loss_mlp": 0.01279079, + "balance_loss_clip": 1.14644134, + "balance_loss_mlp": 1.02482915, + "epoch": 0.541934708110383, + "flos": 12460283356320.0, + "grad_norm": 4.0664916477392135, + "language_loss": 0.87466216, + "learning_rate": 1.8257316260690987e-06, + "loss": 0.90251946, + "num_input_tokens_seen": 97264480, + "router_z_loss_clip": 3.60351562, + "router_z_loss_mlp": 2.54101562, + "step": 4507, + "time_per_iteration": 3.0664379596710205 + }, + { + "auxiliary_loss_clip": 0.01505826, + "auxiliary_loss_mlp": 0.01266611, + "balance_loss_clip": 1.14605808, + "balance_loss_mlp": 1.01751101, + "epoch": 0.5420549510010221, + "flos": 21256063174080.0, + "grad_norm": 1.6078503777142992, + "language_loss": 0.75731134, + "learning_rate": 1.8249556304130254e-06, + "loss": 0.78503573, + "num_input_tokens_seen": 97285760, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 2.48828125, + "step": 4508, + "time_per_iteration": 3.027008533477783 + }, + { + "auxiliary_loss_clip": 0.01508136, + "auxiliary_loss_mlp": 0.0128128, + "balance_loss_clip": 1.14789319, + "balance_loss_mlp": 1.02703047, + "epoch": 0.5421751938916611, + "flos": 29492738520480.0, + "grad_norm": 2.1971609567744275, + "language_loss": 0.68479896, + "learning_rate": 1.824179661311044e-06, + "loss": 0.7126931, + "num_input_tokens_seen": 97304510, + "router_z_loss_clip": 3.6015625, + "router_z_loss_mlp": 2.54101562, + "step": 4509, + "time_per_iteration": 3.0330848693847656 + }, + { + "auxiliary_loss_clip": 0.01502539, + "auxiliary_loss_mlp": 0.01276047, + "balance_loss_clip": 1.14155185, + "balance_loss_mlp": 1.02504027, + "epoch": 0.5422954367823003, + "flos": 18736242798240.0, + "grad_norm": 2.400270245529671, + "language_loss": 0.80136895, + "learning_rate": 1.823403718880868e-06, + "loss": 0.82915479, + "num_input_tokens_seen": 97323270, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 2.5078125, + "step": 4510, + "time_per_iteration": 2.9654266834259033 + }, + { + "auxiliary_loss_clip": 0.01501159, + "auxiliary_loss_mlp": 0.01286434, + "balance_loss_clip": 1.14019501, + "balance_loss_mlp": 1.03294754, + "epoch": 0.5424156796729394, + "flos": 39972204047520.0, + "grad_norm": 1.925138337735092, + "language_loss": 0.66677284, + "learning_rate": 1.822627803240207e-06, + "loss": 0.69464886, + "num_input_tokens_seen": 97345600, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 2.53320312, + "step": 4511, + "time_per_iteration": 3.098128318786621 + }, + { + "auxiliary_loss_clip": 0.01497631, + "auxiliary_loss_mlp": 0.01271883, + "balance_loss_clip": 1.13735771, + "balance_loss_mlp": 1.01915932, + "epoch": 0.5425359225635784, + "flos": 11548139928480.0, + "grad_norm": 2.363587757016273, + "language_loss": 0.85387087, + "learning_rate": 1.8218519145067675e-06, + "loss": 0.88156599, + "num_input_tokens_seen": 97361220, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 2.52539062, + "step": 4512, + "time_per_iteration": 2.8989686965942383 + }, + { + "auxiliary_loss_clip": 0.01504973, + "auxiliary_loss_mlp": 0.01281855, + "balance_loss_clip": 1.14370131, + "balance_loss_mlp": 1.02817786, + "epoch": 0.5426561654542175, + "flos": 20231765084160.0, + "grad_norm": 1.838256503698568, + "language_loss": 0.89652985, + "learning_rate": 1.8210760527982508e-06, + "loss": 0.92439818, + "num_input_tokens_seen": 97381505, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 2.53515625, + "step": 4513, + "time_per_iteration": 3.0105814933776855 + }, + { + "auxiliary_loss_clip": 0.0151146, + "auxiliary_loss_mlp": 0.01274387, + "balance_loss_clip": 1.15122712, + "balance_loss_mlp": 1.02509665, + "epoch": 0.5427764083448566, + "flos": 21873843237600.0, + "grad_norm": 1.901014966337384, + "language_loss": 0.74838299, + "learning_rate": 1.8203002182323552e-06, + "loss": 0.77624142, + "num_input_tokens_seen": 97399060, + "router_z_loss_clip": 3.60742188, + "router_z_loss_mlp": 2.49023438, + "step": 4514, + "time_per_iteration": 3.796247959136963 + }, + { + "auxiliary_loss_clip": 0.01502127, + "auxiliary_loss_mlp": 0.01279541, + "balance_loss_clip": 1.13951731, + "balance_loss_mlp": 1.02872467, + "epoch": 0.5428966512354957, + "flos": 19642697002080.0, + "grad_norm": 4.107346625447289, + "language_loss": 0.75520265, + "learning_rate": 1.819524410926773e-06, + "loss": 0.78301936, + "num_input_tokens_seen": 97416740, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 2.50585938, + "step": 4515, + "time_per_iteration": 3.9680404663085938 + }, + { + "auxiliary_loss_clip": 0.01500987, + "auxiliary_loss_mlp": 0.01284943, + "balance_loss_clip": 1.14035892, + "balance_loss_mlp": 1.03183746, + "epoch": 0.5430168941261347, + "flos": 22384361371680.0, + "grad_norm": 3.5718172839402613, + "language_loss": 0.7735213, + "learning_rate": 1.8187486309991944e-06, + "loss": 0.80138057, + "num_input_tokens_seen": 97437620, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 2.52929688, + "step": 4516, + "time_per_iteration": 3.0070266723632812 + }, + { + "auxiliary_loss_clip": 0.01510655, + "auxiliary_loss_mlp": 0.0127319, + "balance_loss_clip": 1.1513083, + "balance_loss_mlp": 1.0200851, + "epoch": 0.5431371370167739, + "flos": 18765827127360.0, + "grad_norm": 1.9940941733672253, + "language_loss": 0.77327502, + "learning_rate": 1.817972878567304e-06, + "loss": 0.80111349, + "num_input_tokens_seen": 97456275, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 2.52929688, + "step": 4517, + "time_per_iteration": 3.0248231887817383 + }, + { + "auxiliary_loss_clip": 0.01503247, + "auxiliary_loss_mlp": 0.01284325, + "balance_loss_clip": 1.14236999, + "balance_loss_mlp": 1.03331792, + "epoch": 0.543257379907413, + "flos": 18807965677440.0, + "grad_norm": 2.2140781919761716, + "language_loss": 0.76353884, + "learning_rate": 1.8171971537487834e-06, + "loss": 0.7914145, + "num_input_tokens_seen": 97474925, + "router_z_loss_clip": 3.609375, + "router_z_loss_mlp": 2.5078125, + "step": 4518, + "time_per_iteration": 3.930114984512329 + }, + { + "auxiliary_loss_clip": 0.01503298, + "auxiliary_loss_mlp": 0.01277311, + "balance_loss_clip": 1.14168108, + "balance_loss_mlp": 1.02630353, + "epoch": 0.543377622798052, + "flos": 17495296872480.0, + "grad_norm": 2.111831810212059, + "language_loss": 0.80795509, + "learning_rate": 1.8164214566613093e-06, + "loss": 0.83576119, + "num_input_tokens_seen": 97493550, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.5078125, + "step": 4519, + "time_per_iteration": 3.092907428741455 + }, + { + "auxiliary_loss_clip": 0.01501266, + "auxiliary_loss_mlp": 0.01276628, + "balance_loss_clip": 1.13904142, + "balance_loss_mlp": 1.02504873, + "epoch": 0.5434978656886912, + "flos": 18991312224480.0, + "grad_norm": 2.7244981589195625, + "language_loss": 0.65623677, + "learning_rate": 1.8156457874225547e-06, + "loss": 0.68401575, + "num_input_tokens_seen": 97512010, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.51367188, + "step": 4520, + "time_per_iteration": 3.039172410964966 + }, + { + "auxiliary_loss_clip": 0.01499487, + "auxiliary_loss_mlp": 0.01270416, + "balance_loss_clip": 1.13626075, + "balance_loss_mlp": 1.01979065, + "epoch": 0.5436181085793302, + "flos": 17276866413120.0, + "grad_norm": 1.9566809931144118, + "language_loss": 0.80968714, + "learning_rate": 1.814870146150187e-06, + "loss": 0.83738613, + "num_input_tokens_seen": 97530120, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.50390625, + "step": 4521, + "time_per_iteration": 3.7499582767486572 + }, + { + "auxiliary_loss_clip": 0.01496821, + "auxiliary_loss_mlp": 0.01283736, + "balance_loss_clip": 1.13314128, + "balance_loss_mlp": 1.03024912, + "epoch": 0.5437383514699693, + "flos": 19100584346400.0, + "grad_norm": 2.4285262440456057, + "language_loss": 0.7899608, + "learning_rate": 1.814094532961871e-06, + "loss": 0.81776631, + "num_input_tokens_seen": 97548695, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 2.53320312, + "step": 4522, + "time_per_iteration": 3.139798641204834 + }, + { + "auxiliary_loss_clip": 0.01499669, + "auxiliary_loss_mlp": 0.01299621, + "balance_loss_clip": 1.13838196, + "balance_loss_mlp": 1.04479909, + "epoch": 0.5438585943606085, + "flos": 22604991664320.0, + "grad_norm": 2.2014487927065973, + "language_loss": 0.83488786, + "learning_rate": 1.8133189479752666e-06, + "loss": 0.86288071, + "num_input_tokens_seen": 97567625, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 2.546875, + "step": 4523, + "time_per_iteration": 3.014524221420288 + }, + { + "auxiliary_loss_clip": 0.01498431, + "auxiliary_loss_mlp": 0.01269057, + "balance_loss_clip": 1.13586175, + "balance_loss_mlp": 1.01881337, + "epoch": 0.5439788372512475, + "flos": 21800641160160.0, + "grad_norm": 2.9362799440831044, + "language_loss": 0.81959337, + "learning_rate": 1.8125433913080292e-06, + "loss": 0.84726822, + "num_input_tokens_seen": 97585325, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.5, + "step": 4524, + "time_per_iteration": 2.9760539531707764 + }, + { + "auxiliary_loss_clip": 0.01495987, + "auxiliary_loss_mlp": 0.01276804, + "balance_loss_clip": 1.13440871, + "balance_loss_mlp": 1.02808535, + "epoch": 0.5440990801418866, + "flos": 16401248167680.0, + "grad_norm": 2.2661139105499313, + "language_loss": 0.82720381, + "learning_rate": 1.811767863077811e-06, + "loss": 0.85493171, + "num_input_tokens_seen": 97604275, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 2.484375, + "step": 4525, + "time_per_iteration": 2.993973731994629 + }, + { + "auxiliary_loss_clip": 0.01502188, + "auxiliary_loss_mlp": 0.01282201, + "balance_loss_clip": 1.13991499, + "balance_loss_mlp": 1.03348231, + "epoch": 0.5442193230325257, + "flos": 21617749751040.0, + "grad_norm": 2.3683756046485516, + "language_loss": 0.78407931, + "learning_rate": 1.8109923634022577e-06, + "loss": 0.81192315, + "num_input_tokens_seen": 97624300, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.484375, + "step": 4526, + "time_per_iteration": 3.0027801990509033 + }, + { + "auxiliary_loss_clip": 0.0149638, + "auxiliary_loss_mlp": 0.01278736, + "balance_loss_clip": 1.13305092, + "balance_loss_mlp": 1.02544022, + "epoch": 0.5443395659231648, + "flos": 15482694880800.0, + "grad_norm": 3.7475153124514335, + "language_loss": 0.8617394, + "learning_rate": 1.8102168923990128e-06, + "loss": 0.88949054, + "num_input_tokens_seen": 97637845, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.53125, + "step": 4527, + "time_per_iteration": 2.970939874649048 + }, + { + "auxiliary_loss_clip": 0.0150052, + "auxiliary_loss_mlp": 0.01280418, + "balance_loss_clip": 1.13640046, + "balance_loss_mlp": 1.02979279, + "epoch": 0.5444598088138038, + "flos": 18772426627200.0, + "grad_norm": 1.9583303614707315, + "language_loss": 0.80248374, + "learning_rate": 1.809441450185714e-06, + "loss": 0.83029312, + "num_input_tokens_seen": 97656330, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.50390625, + "step": 4528, + "time_per_iteration": 3.0042049884796143 + }, + { + "auxiliary_loss_clip": 0.01495149, + "auxiliary_loss_mlp": 0.01271939, + "balance_loss_clip": 1.13265395, + "balance_loss_mlp": 1.02036023, + "epoch": 0.544580051704443, + "flos": 21144477434400.0, + "grad_norm": 2.1648243130592686, + "language_loss": 0.73511833, + "learning_rate": 1.8086660368799958e-06, + "loss": 0.76278925, + "num_input_tokens_seen": 97674380, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.51367188, + "step": 4529, + "time_per_iteration": 2.9288504123687744 + }, + { + "auxiliary_loss_clip": 0.01501481, + "auxiliary_loss_mlp": 0.01279137, + "balance_loss_clip": 1.13784552, + "balance_loss_mlp": 1.02565086, + "epoch": 0.5447002945950821, + "flos": 32494402977120.0, + "grad_norm": 2.010920196240836, + "language_loss": 0.77320969, + "learning_rate": 1.807890652599488e-06, + "loss": 0.80101585, + "num_input_tokens_seen": 97698765, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 2.53320312, + "step": 4530, + "time_per_iteration": 3.186420202255249 + }, + { + "auxiliary_loss_clip": 0.01496873, + "auxiliary_loss_mlp": 0.01274039, + "balance_loss_clip": 1.13354397, + "balance_loss_mlp": 1.02417684, + "epoch": 0.5448205374857211, + "flos": 11799719964000.0, + "grad_norm": 2.184817989184994, + "language_loss": 0.82779121, + "learning_rate": 1.8071152974618156e-06, + "loss": 0.85550034, + "num_input_tokens_seen": 97716565, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 2.49609375, + "step": 4531, + "time_per_iteration": 3.0744869709014893 + }, + { + "auxiliary_loss_clip": 0.01496466, + "auxiliary_loss_mlp": 0.01273894, + "balance_loss_clip": 1.13274503, + "balance_loss_mlp": 1.02441299, + "epoch": 0.5449407803763603, + "flos": 24136053000480.0, + "grad_norm": 34.113971721979674, + "language_loss": 0.78222001, + "learning_rate": 1.806339971584599e-06, + "loss": 0.80992365, + "num_input_tokens_seen": 97733225, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.4921875, + "step": 4532, + "time_per_iteration": 2.981283187866211 + }, + { + "auxiliary_loss_clip": 0.01500143, + "auxiliary_loss_mlp": 0.01271845, + "balance_loss_clip": 1.13614631, + "balance_loss_mlp": 1.02350843, + "epoch": 0.5450610232669993, + "flos": 23260814036640.0, + "grad_norm": 1.9486514882290775, + "language_loss": 0.85337025, + "learning_rate": 1.8055646750854546e-06, + "loss": 0.88109016, + "num_input_tokens_seen": 97752735, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 2.48046875, + "step": 4533, + "time_per_iteration": 2.990574359893799 + }, + { + "auxiliary_loss_clip": 0.01500254, + "auxiliary_loss_mlp": 0.01273282, + "balance_loss_clip": 1.13683498, + "balance_loss_mlp": 1.02113104, + "epoch": 0.5451812661576384, + "flos": 17787005265600.0, + "grad_norm": 4.837668734648742, + "language_loss": 0.82306683, + "learning_rate": 1.8047894080819945e-06, + "loss": 0.85080218, + "num_input_tokens_seen": 97769985, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.51953125, + "step": 4534, + "time_per_iteration": 2.969944477081299 + }, + { + "auxiliary_loss_clip": 0.01497366, + "auxiliary_loss_mlp": 0.01206497, + "balance_loss_clip": 1.13857996, + "balance_loss_mlp": 1.00279236, + "epoch": 0.5453015090482776, + "flos": 71070002108640.0, + "grad_norm": 0.7645363725395032, + "language_loss": 0.63157338, + "learning_rate": 1.8040141706918258e-06, + "loss": 0.65861201, + "num_input_tokens_seen": 97831225, + "router_z_loss_clip": 3.59375, + "router_z_loss_mlp": 2.0390625, + "step": 4535, + "time_per_iteration": 3.6759631633758545 + }, + { + "auxiliary_loss_clip": 0.01504035, + "auxiliary_loss_mlp": 0.01270554, + "balance_loss_clip": 1.14251423, + "balance_loss_mlp": 1.02069092, + "epoch": 0.5454217519389166, + "flos": 25554314895840.0, + "grad_norm": 2.748730135003294, + "language_loss": 0.76859498, + "learning_rate": 1.8032389630325525e-06, + "loss": 0.79634088, + "num_input_tokens_seen": 97849975, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 2.49609375, + "step": 4536, + "time_per_iteration": 3.0727713108062744 + }, + { + "auxiliary_loss_clip": 0.01495397, + "auxiliary_loss_mlp": 0.01261174, + "balance_loss_clip": 1.1327461, + "balance_loss_mlp": 1.0132184, + "epoch": 0.5455419948295557, + "flos": 23660542922400.0, + "grad_norm": 1.9877422493099397, + "language_loss": 0.75787175, + "learning_rate": 1.8024637852217707e-06, + "loss": 0.78543746, + "num_input_tokens_seen": 97869700, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.4765625, + "step": 4537, + "time_per_iteration": 3.0352394580841064 + }, + { + "auxiliary_loss_clip": 0.01497001, + "auxiliary_loss_mlp": 0.01272069, + "balance_loss_clip": 1.13284254, + "balance_loss_mlp": 1.02201581, + "epoch": 0.5456622377201948, + "flos": 23406232059360.0, + "grad_norm": 1.9484899037284924, + "language_loss": 0.85009187, + "learning_rate": 1.8016886373770766e-06, + "loss": 0.87778258, + "num_input_tokens_seen": 97888215, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 2.49804688, + "step": 4538, + "time_per_iteration": 3.136699914932251 + }, + { + "auxiliary_loss_clip": 0.01501548, + "auxiliary_loss_mlp": 0.01272489, + "balance_loss_clip": 1.13935852, + "balance_loss_mlp": 1.0233891, + "epoch": 0.5457824806108339, + "flos": 23990331552480.0, + "grad_norm": 2.504917260251588, + "language_loss": 0.79090452, + "learning_rate": 1.8009135196160579e-06, + "loss": 0.818645, + "num_input_tokens_seen": 97907090, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 2.48828125, + "step": 4539, + "time_per_iteration": 2.9821622371673584 + }, + { + "auxiliary_loss_clip": 0.01496428, + "auxiliary_loss_mlp": 0.01280741, + "balance_loss_clip": 1.13368523, + "balance_loss_mlp": 1.03125954, + "epoch": 0.545902723501473, + "flos": 22568125128480.0, + "grad_norm": 1.8828562175137045, + "language_loss": 0.84341228, + "learning_rate": 1.8001384320563e-06, + "loss": 0.87118393, + "num_input_tokens_seen": 97927345, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.4921875, + "step": 4540, + "time_per_iteration": 3.0396809577941895 + }, + { + "auxiliary_loss_clip": 0.01500026, + "auxiliary_loss_mlp": 0.01211746, + "balance_loss_clip": 1.14192498, + "balance_loss_mlp": 1.00804138, + "epoch": 0.5460229663921121, + "flos": 55203280964640.0, + "grad_norm": 0.7779895414495375, + "language_loss": 0.57880938, + "learning_rate": 1.7993633748153833e-06, + "loss": 0.60592711, + "num_input_tokens_seen": 97981950, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 2.0390625, + "step": 4541, + "time_per_iteration": 4.272720575332642 + }, + { + "auxiliary_loss_clip": 0.01502537, + "auxiliary_loss_mlp": 0.01274368, + "balance_loss_clip": 1.14029932, + "balance_loss_mlp": 1.02259779, + "epoch": 0.5461432092827512, + "flos": 15415675093440.0, + "grad_norm": 2.0108282040117524, + "language_loss": 0.72807992, + "learning_rate": 1.7985883480108834e-06, + "loss": 0.755849, + "num_input_tokens_seen": 97999585, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.515625, + "step": 4542, + "time_per_iteration": 4.12929368019104 + }, + { + "auxiliary_loss_clip": 0.0149636, + "auxiliary_loss_mlp": 0.0127822, + "balance_loss_clip": 1.13322735, + "balance_loss_mlp": 1.02835727, + "epoch": 0.5462634521733902, + "flos": 24027236016480.0, + "grad_norm": 1.7009925993893462, + "language_loss": 0.71986026, + "learning_rate": 1.797813351760371e-06, + "loss": 0.74760604, + "num_input_tokens_seen": 98021290, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 2.49609375, + "step": 4543, + "time_per_iteration": 3.2736096382141113 + }, + { + "auxiliary_loss_clip": 0.01497455, + "auxiliary_loss_mlp": 0.01279988, + "balance_loss_clip": 1.13576829, + "balance_loss_mlp": 1.02592933, + "epoch": 0.5463836950640293, + "flos": 22823194554720.0, + "grad_norm": 2.119786373462812, + "language_loss": 0.78170609, + "learning_rate": 1.7970383861814116e-06, + "loss": 0.80948043, + "num_input_tokens_seen": 98041060, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 2.5390625, + "step": 4544, + "time_per_iteration": 3.093449831008911 + }, + { + "auxiliary_loss_clip": 0.01503364, + "auxiliary_loss_mlp": 0.01276211, + "balance_loss_clip": 1.14104176, + "balance_loss_mlp": 1.0269208, + "epoch": 0.5465039379546685, + "flos": 20450347256160.0, + "grad_norm": 4.583114962800385, + "language_loss": 0.74248099, + "learning_rate": 1.7962634513915684e-06, + "loss": 0.77027667, + "num_input_tokens_seen": 98058410, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.49023438, + "step": 4545, + "time_per_iteration": 3.8611109256744385 + }, + { + "auxiliary_loss_clip": 0.01498913, + "auxiliary_loss_mlp": 0.01271353, + "balance_loss_clip": 1.13729346, + "balance_loss_mlp": 1.01977396, + "epoch": 0.5466241808453075, + "flos": 17344986117120.0, + "grad_norm": 1.8592414202232304, + "language_loss": 0.79606664, + "learning_rate": 1.7954885475083969e-06, + "loss": 0.82376933, + "num_input_tokens_seen": 98076080, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.51367188, + "step": 4546, + "time_per_iteration": 2.9347875118255615 + }, + { + "auxiliary_loss_clip": 0.01503754, + "auxiliary_loss_mlp": 0.01279892, + "balance_loss_clip": 1.14090562, + "balance_loss_mlp": 1.02850294, + "epoch": 0.5467444237359466, + "flos": 21619228949280.0, + "grad_norm": 2.3523416413909874, + "language_loss": 0.72836435, + "learning_rate": 1.7947136746494513e-06, + "loss": 0.75620079, + "num_input_tokens_seen": 98096995, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.51171875, + "step": 4547, + "time_per_iteration": 3.061720609664917 + }, + { + "auxiliary_loss_clip": 0.01502941, + "auxiliary_loss_mlp": 0.0127125, + "balance_loss_clip": 1.14066637, + "balance_loss_mlp": 1.02157819, + "epoch": 0.5468646666265857, + "flos": 24172767823680.0, + "grad_norm": 2.423821906392548, + "language_loss": 0.88434613, + "learning_rate": 1.793938832932277e-06, + "loss": 0.91208804, + "num_input_tokens_seen": 98115105, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 2.49414062, + "step": 4548, + "time_per_iteration": 3.8627920150756836 + }, + { + "auxiliary_loss_clip": 0.01498758, + "auxiliary_loss_mlp": 0.01277507, + "balance_loss_clip": 1.13692689, + "balance_loss_mlp": 1.02611816, + "epoch": 0.5469849095172248, + "flos": 27529481429280.0, + "grad_norm": 1.9718177295883428, + "language_loss": 0.70506477, + "learning_rate": 1.7931640224744185e-06, + "loss": 0.73282743, + "num_input_tokens_seen": 98135655, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.51171875, + "step": 4549, + "time_per_iteration": 2.964351177215576 + }, + { + "auxiliary_loss_clip": 0.01496868, + "auxiliary_loss_mlp": 0.0127294, + "balance_loss_clip": 1.13393283, + "balance_loss_mlp": 1.02269602, + "epoch": 0.5471051524078638, + "flos": 27967176767520.0, + "grad_norm": 1.5667520509351656, + "language_loss": 0.73804319, + "learning_rate": 1.7923892433934127e-06, + "loss": 0.76574123, + "num_input_tokens_seen": 98156730, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.5, + "step": 4550, + "time_per_iteration": 3.0096688270568848 + }, + { + "auxiliary_loss_clip": 0.01507041, + "auxiliary_loss_mlp": 0.0127939, + "balance_loss_clip": 1.14459872, + "balance_loss_mlp": 1.02857327, + "epoch": 0.547225395298503, + "flos": 18152484658560.0, + "grad_norm": 1.9101102413717221, + "language_loss": 0.78989017, + "learning_rate": 1.7916144958067939e-06, + "loss": 0.81775451, + "num_input_tokens_seen": 98174590, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.50585938, + "step": 4551, + "time_per_iteration": 2.988553524017334 + }, + { + "auxiliary_loss_clip": 0.01503242, + "auxiliary_loss_mlp": 0.01278315, + "balance_loss_clip": 1.14216733, + "balance_loss_mlp": 1.02864265, + "epoch": 0.5473456381891421, + "flos": 21363666456960.0, + "grad_norm": 2.273877901050014, + "language_loss": 0.79467112, + "learning_rate": 1.7908397798320905e-06, + "loss": 0.82248664, + "num_input_tokens_seen": 98194325, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 2.49414062, + "step": 4552, + "time_per_iteration": 3.0544257164001465 + }, + { + "auxiliary_loss_clip": 0.01508257, + "auxiliary_loss_mlp": 0.01278243, + "balance_loss_clip": 1.14744151, + "balance_loss_mlp": 1.02227676, + "epoch": 0.5474658810797811, + "flos": 19933874400960.0, + "grad_norm": 1.9307983201010215, + "language_loss": 0.75005805, + "learning_rate": 1.7900650955868265e-06, + "loss": 0.77792311, + "num_input_tokens_seen": 98213970, + "router_z_loss_clip": 3.60742188, + "router_z_loss_mlp": 2.55859375, + "step": 4553, + "time_per_iteration": 3.08530330657959 + }, + { + "auxiliary_loss_clip": 0.01511343, + "auxiliary_loss_mlp": 0.01273097, + "balance_loss_clip": 1.14952755, + "balance_loss_mlp": 1.02170873, + "epoch": 0.5475861239704203, + "flos": 50479964346240.0, + "grad_norm": 1.5228163478319185, + "language_loss": 0.7657702, + "learning_rate": 1.7892904431885202e-06, + "loss": 0.79361457, + "num_input_tokens_seen": 98241145, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 2.51171875, + "step": 4554, + "time_per_iteration": 3.2882730960845947 + }, + { + "auxiliary_loss_clip": 0.01504068, + "auxiliary_loss_mlp": 0.01273111, + "balance_loss_clip": 1.14312732, + "balance_loss_mlp": 1.0259186, + "epoch": 0.5477063668610593, + "flos": 20707275162240.0, + "grad_norm": 2.5166564703056156, + "language_loss": 0.75535238, + "learning_rate": 1.788515822754686e-06, + "loss": 0.78312421, + "num_input_tokens_seen": 98261565, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 2.46875, + "step": 4555, + "time_per_iteration": 3.1744132041931152 + }, + { + "auxiliary_loss_clip": 0.01512818, + "auxiliary_loss_mlp": 0.01290685, + "balance_loss_clip": 1.15085781, + "balance_loss_mlp": 1.03872395, + "epoch": 0.5478266097516984, + "flos": 19611785187360.0, + "grad_norm": 2.5249557504877234, + "language_loss": 0.78258014, + "learning_rate": 1.7877412344028335e-06, + "loss": 0.81061518, + "num_input_tokens_seen": 98281370, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.51757812, + "step": 4556, + "time_per_iteration": 3.1222596168518066 + }, + { + "auxiliary_loss_clip": 0.01515968, + "auxiliary_loss_mlp": 0.01271409, + "balance_loss_clip": 1.15583444, + "balance_loss_mlp": 1.02192807, + "epoch": 0.5479468526423376, + "flos": 12898282119840.0, + "grad_norm": 2.2671430614898807, + "language_loss": 0.77459735, + "learning_rate": 1.7869666782504668e-06, + "loss": 0.80247104, + "num_input_tokens_seen": 98297950, + "router_z_loss_clip": 3.6015625, + "router_z_loss_mlp": 2.4921875, + "step": 4557, + "time_per_iteration": 3.0715768337249756 + }, + { + "auxiliary_loss_clip": 0.01498944, + "auxiliary_loss_mlp": 0.01271582, + "balance_loss_clip": 1.13696277, + "balance_loss_mlp": 1.02152824, + "epoch": 0.5480670955329766, + "flos": 18590824775520.0, + "grad_norm": 2.059723472085122, + "language_loss": 0.6875726, + "learning_rate": 1.7861921544150867e-06, + "loss": 0.71527785, + "num_input_tokens_seen": 98316800, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.49804688, + "step": 4558, + "time_per_iteration": 3.1067311763763428 + }, + { + "auxiliary_loss_clip": 0.01505993, + "auxiliary_loss_mlp": 0.01281798, + "balance_loss_clip": 1.14418113, + "balance_loss_mlp": 1.0325079, + "epoch": 0.5481873384236157, + "flos": 15956080981920.0, + "grad_norm": 2.047936318194825, + "language_loss": 0.76017261, + "learning_rate": 1.7854176630141856e-06, + "loss": 0.78805053, + "num_input_tokens_seen": 98333935, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 2.49023438, + "step": 4559, + "time_per_iteration": 3.0827226638793945 + }, + { + "auxiliary_loss_clip": 0.01513159, + "auxiliary_loss_mlp": 0.01286407, + "balance_loss_clip": 1.15205717, + "balance_loss_mlp": 1.03234792, + "epoch": 0.5483075813142548, + "flos": 22786024593600.0, + "grad_norm": 3.858794138387785, + "language_loss": 0.84216291, + "learning_rate": 1.784643204165255e-06, + "loss": 0.87015855, + "num_input_tokens_seen": 98353255, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 2.5390625, + "step": 4560, + "time_per_iteration": 3.0586647987365723 + }, + { + "auxiliary_loss_clip": 0.01507871, + "auxiliary_loss_mlp": 0.01280832, + "balance_loss_clip": 1.14590955, + "balance_loss_mlp": 1.02906203, + "epoch": 0.5484278242048939, + "flos": 19319204446560.0, + "grad_norm": 2.483338577080452, + "language_loss": 0.77787524, + "learning_rate": 1.7838687779857783e-06, + "loss": 0.80576223, + "num_input_tokens_seen": 98371130, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 2.515625, + "step": 4561, + "time_per_iteration": 3.124696731567383 + }, + { + "auxiliary_loss_clip": 0.01507549, + "auxiliary_loss_mlp": 0.01285163, + "balance_loss_clip": 1.1456542, + "balance_loss_mlp": 1.03644443, + "epoch": 0.5485480670955329, + "flos": 22817998396800.0, + "grad_norm": 2.022655506136383, + "language_loss": 0.64110947, + "learning_rate": 1.7830943845932366e-06, + "loss": 0.66903663, + "num_input_tokens_seen": 98390455, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 2.484375, + "step": 4562, + "time_per_iteration": 3.030266761779785 + }, + { + "auxiliary_loss_clip": 0.01507481, + "auxiliary_loss_mlp": 0.01271956, + "balance_loss_clip": 1.14450681, + "balance_loss_mlp": 1.02113962, + "epoch": 0.5486683099861721, + "flos": 22673338937280.0, + "grad_norm": 2.25327614458952, + "language_loss": 0.75251031, + "learning_rate": 1.7823200241051044e-06, + "loss": 0.78030473, + "num_input_tokens_seen": 98409370, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.50585938, + "step": 4563, + "time_per_iteration": 3.0885891914367676 + }, + { + "auxiliary_loss_clip": 0.01513344, + "auxiliary_loss_mlp": 0.01281944, + "balance_loss_clip": 1.15228832, + "balance_loss_mlp": 1.02998281, + "epoch": 0.5487885528768112, + "flos": 23151655699200.0, + "grad_norm": 1.9215612549470222, + "language_loss": 0.80370736, + "learning_rate": 1.7815456966388513e-06, + "loss": 0.83166021, + "num_input_tokens_seen": 98428465, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 2.51757812, + "step": 4564, + "time_per_iteration": 3.0576539039611816 + }, + { + "auxiliary_loss_clip": 0.01514782, + "auxiliary_loss_mlp": 0.01277676, + "balance_loss_clip": 1.15468657, + "balance_loss_mlp": 1.02437973, + "epoch": 0.5489087957674502, + "flos": 22056089868000.0, + "grad_norm": 2.6816408268075858, + "language_loss": 0.8114078, + "learning_rate": 1.780771402311943e-06, + "loss": 0.8393324, + "num_input_tokens_seen": 98447300, + "router_z_loss_clip": 3.60351562, + "router_z_loss_mlp": 2.53125, + "step": 4565, + "time_per_iteration": 2.9738593101501465 + }, + { + "auxiliary_loss_clip": 0.01509983, + "auxiliary_loss_mlp": 0.0128389, + "balance_loss_clip": 1.147066, + "balance_loss_mlp": 1.0296402, + "epoch": 0.5490290386580894, + "flos": 24318261702720.0, + "grad_norm": 2.863745065017485, + "language_loss": 0.78792906, + "learning_rate": 1.7799971412418374e-06, + "loss": 0.81586772, + "num_input_tokens_seen": 98468695, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.54101562, + "step": 4566, + "time_per_iteration": 3.1744890213012695 + }, + { + "auxiliary_loss_clip": 0.01513558, + "auxiliary_loss_mlp": 0.01277105, + "balance_loss_clip": 1.15180063, + "balance_loss_mlp": 1.02361834, + "epoch": 0.5491492815487284, + "flos": 18296840692800.0, + "grad_norm": 2.6813423102741734, + "language_loss": 0.74306309, + "learning_rate": 1.7792229135459918e-06, + "loss": 0.77096975, + "num_input_tokens_seen": 98485345, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 2.53320312, + "step": 4567, + "time_per_iteration": 3.084803819656372 + }, + { + "auxiliary_loss_clip": 0.01509408, + "auxiliary_loss_mlp": 0.01201134, + "balance_loss_clip": 1.15040231, + "balance_loss_mlp": 0.99742889, + "epoch": 0.5492695244393675, + "flos": 64556306773920.0, + "grad_norm": 0.749995221042471, + "language_loss": 0.61514127, + "learning_rate": 1.7784487193418538e-06, + "loss": 0.64224666, + "num_input_tokens_seen": 98543195, + "router_z_loss_clip": 3.59375, + "router_z_loss_mlp": 2.0390625, + "step": 4568, + "time_per_iteration": 3.375429153442383 + }, + { + "auxiliary_loss_clip": 0.01508581, + "auxiliary_loss_mlp": 0.01279866, + "balance_loss_clip": 1.14618504, + "balance_loss_mlp": 1.02542615, + "epoch": 0.5493897673300067, + "flos": 17381245802400.0, + "grad_norm": 2.760313598723923, + "language_loss": 0.61300039, + "learning_rate": 1.7776745587468698e-06, + "loss": 0.64088488, + "num_input_tokens_seen": 98560620, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.54296875, + "step": 4569, + "time_per_iteration": 3.9690439701080322 + }, + { + "auxiliary_loss_clip": 0.01506425, + "auxiliary_loss_mlp": 0.01282937, + "balance_loss_clip": 1.14415932, + "balance_loss_mlp": 1.03192973, + "epoch": 0.5495100102206457, + "flos": 19903834933920.0, + "grad_norm": 9.090342641517877, + "language_loss": 0.81468523, + "learning_rate": 1.7769004318784776e-06, + "loss": 0.84257889, + "num_input_tokens_seen": 98578265, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 2.5078125, + "step": 4570, + "time_per_iteration": 2.9803428649902344 + }, + { + "auxiliary_loss_clip": 0.0150141, + "auxiliary_loss_mlp": 0.01273526, + "balance_loss_clip": 1.13838816, + "balance_loss_mlp": 1.02270937, + "epoch": 0.5496302531112848, + "flos": 16729064533440.0, + "grad_norm": 2.127555301469879, + "language_loss": 0.80710334, + "learning_rate": 1.776126338854113e-06, + "loss": 0.8348527, + "num_input_tokens_seen": 98596055, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.50585938, + "step": 4571, + "time_per_iteration": 2.9511559009552 + }, + { + "auxiliary_loss_clip": 0.01509081, + "auxiliary_loss_mlp": 0.01275212, + "balance_loss_clip": 1.14775455, + "balance_loss_mlp": 1.02553988, + "epoch": 0.5497504960019239, + "flos": 24574696542720.0, + "grad_norm": 1.8102788747238046, + "language_loss": 0.8469069, + "learning_rate": 1.7753522797912044e-06, + "loss": 0.8747499, + "num_input_tokens_seen": 98616140, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.49414062, + "step": 4572, + "time_per_iteration": 3.0432965755462646 + }, + { + "auxiliary_loss_clip": 0.01506184, + "auxiliary_loss_mlp": 0.01283871, + "balance_loss_clip": 1.14276004, + "balance_loss_mlp": 1.03057551, + "epoch": 0.549870738892563, + "flos": 15452314060320.0, + "grad_norm": 3.079752674585612, + "language_loss": 0.70407724, + "learning_rate": 1.7745782548071765e-06, + "loss": 0.7319777, + "num_input_tokens_seen": 98633035, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.53125, + "step": 4573, + "time_per_iteration": 3.810610294342041 + }, + { + "auxiliary_loss_clip": 0.01511834, + "auxiliary_loss_mlp": 0.01278139, + "balance_loss_clip": 1.14975536, + "balance_loss_mlp": 1.02865744, + "epoch": 0.549990981783202, + "flos": 21071237428800.0, + "grad_norm": 1.7296479926693145, + "language_loss": 0.74235404, + "learning_rate": 1.7738042640194482e-06, + "loss": 0.77025378, + "num_input_tokens_seen": 98652700, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.4921875, + "step": 4574, + "time_per_iteration": 3.0056650638580322 + }, + { + "auxiliary_loss_clip": 0.01509729, + "auxiliary_loss_mlp": 0.01277213, + "balance_loss_clip": 1.14572561, + "balance_loss_mlp": 1.02563357, + "epoch": 0.5501112246738411, + "flos": 21397953877920.0, + "grad_norm": 1.6515603260407117, + "language_loss": 0.70451319, + "learning_rate": 1.7730303075454335e-06, + "loss": 0.73238266, + "num_input_tokens_seen": 98671590, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.51367188, + "step": 4575, + "time_per_iteration": 3.1081621646881104 + }, + { + "auxiliary_loss_clip": 0.01511808, + "auxiliary_loss_mlp": 0.01273917, + "balance_loss_clip": 1.1486119, + "balance_loss_mlp": 1.02195621, + "epoch": 0.5502314675644803, + "flos": 17458847546400.0, + "grad_norm": 2.0303649025499535, + "language_loss": 0.85281193, + "learning_rate": 1.7722563855025402e-06, + "loss": 0.88066918, + "num_input_tokens_seen": 98689620, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.51757812, + "step": 4576, + "time_per_iteration": 3.8152756690979004 + }, + { + "auxiliary_loss_clip": 0.01507017, + "auxiliary_loss_mlp": 0.01288448, + "balance_loss_clip": 1.14401722, + "balance_loss_mlp": 1.03534257, + "epoch": 0.5503517104551193, + "flos": 24312382837920.0, + "grad_norm": 3.0119648411923703, + "language_loss": 0.7077353, + "learning_rate": 1.7714824980081721e-06, + "loss": 0.73569, + "num_input_tokens_seen": 98708915, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.52929688, + "step": 4577, + "time_per_iteration": 3.08215594291687 + }, + { + "auxiliary_loss_clip": 0.0151427, + "auxiliary_loss_mlp": 0.01273162, + "balance_loss_clip": 1.15082121, + "balance_loss_mlp": 1.02101099, + "epoch": 0.5504719533457584, + "flos": 22421645117280.0, + "grad_norm": 1.9200394201331108, + "language_loss": 0.73961663, + "learning_rate": 1.7707086451797276e-06, + "loss": 0.76749092, + "num_input_tokens_seen": 98729790, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 2.51953125, + "step": 4578, + "time_per_iteration": 3.195934295654297 + }, + { + "auxiliary_loss_clip": 0.01511738, + "auxiliary_loss_mlp": 0.01199593, + "balance_loss_clip": 1.15232337, + "balance_loss_mlp": 0.9966507, + "epoch": 0.5505921962363975, + "flos": 67300777827360.0, + "grad_norm": 0.699499020481412, + "language_loss": 0.52307945, + "learning_rate": 1.7699348271345993e-06, + "loss": 0.55019277, + "num_input_tokens_seen": 98792415, + "router_z_loss_clip": 3.59375, + "router_z_loss_mlp": 2.03125, + "step": 4579, + "time_per_iteration": 3.501343250274658 + }, + { + "auxiliary_loss_clip": 0.01512299, + "auxiliary_loss_mlp": 0.01203476, + "balance_loss_clip": 1.15313852, + "balance_loss_mlp": 1.00053406, + "epoch": 0.5507124391270366, + "flos": 45690006857760.0, + "grad_norm": 0.7124264648888429, + "language_loss": 0.54396397, + "learning_rate": 1.7691610439901753e-06, + "loss": 0.57112175, + "num_input_tokens_seen": 98855350, + "router_z_loss_clip": 3.59375, + "router_z_loss_mlp": 2.03125, + "step": 4580, + "time_per_iteration": 3.49465274810791 + }, + { + "auxiliary_loss_clip": 0.01508902, + "auxiliary_loss_mlp": 0.01274493, + "balance_loss_clip": 1.14733934, + "balance_loss_mlp": 1.02386749, + "epoch": 0.5508326820176757, + "flos": 22276075381920.0, + "grad_norm": 2.0319903043259675, + "language_loss": 0.7553612, + "learning_rate": 1.7683872958638367e-06, + "loss": 0.78319514, + "num_input_tokens_seen": 98874230, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 2.50390625, + "step": 4581, + "time_per_iteration": 3.045954704284668 + }, + { + "auxiliary_loss_clip": 0.0150462, + "auxiliary_loss_mlp": 0.01268532, + "balance_loss_clip": 1.14265871, + "balance_loss_mlp": 1.01866961, + "epoch": 0.5509529249083148, + "flos": 20014586254080.0, + "grad_norm": 3.024396771229834, + "language_loss": 0.84544575, + "learning_rate": 1.7676135828729614e-06, + "loss": 0.87317729, + "num_input_tokens_seen": 98893940, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.49609375, + "step": 4582, + "time_per_iteration": 3.020251512527466 + }, + { + "auxiliary_loss_clip": 0.01507644, + "auxiliary_loss_mlp": 0.01276102, + "balance_loss_clip": 1.14468277, + "balance_loss_mlp": 1.02662086, + "epoch": 0.5510731677989539, + "flos": 21836938773600.0, + "grad_norm": 2.175166096021828, + "language_loss": 0.834674, + "learning_rate": 1.7668399051349205e-06, + "loss": 0.86251152, + "num_input_tokens_seen": 98913620, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.4921875, + "step": 4583, + "time_per_iteration": 3.0698153972625732 + }, + { + "auxiliary_loss_clip": 0.01517281, + "auxiliary_loss_mlp": 0.01279906, + "balance_loss_clip": 1.15638065, + "balance_loss_mlp": 1.02889872, + "epoch": 0.5511934106895929, + "flos": 21469904326080.0, + "grad_norm": 2.0922034341111813, + "language_loss": 0.83120042, + "learning_rate": 1.766066262767081e-06, + "loss": 0.85917234, + "num_input_tokens_seen": 98931460, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 2.5078125, + "step": 4584, + "time_per_iteration": 3.0522546768188477 + }, + { + "auxiliary_loss_clip": 0.0150701, + "auxiliary_loss_mlp": 0.01264337, + "balance_loss_clip": 1.14388609, + "balance_loss_mlp": 1.01599979, + "epoch": 0.5513136535802321, + "flos": 21071047788000.0, + "grad_norm": 2.4618740486018225, + "language_loss": 0.77152932, + "learning_rate": 1.765292655886803e-06, + "loss": 0.79924279, + "num_input_tokens_seen": 98950105, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.48046875, + "step": 4585, + "time_per_iteration": 3.013397216796875 + }, + { + "auxiliary_loss_clip": 0.01513359, + "auxiliary_loss_mlp": 0.0127435, + "balance_loss_clip": 1.15196097, + "balance_loss_mlp": 1.02410543, + "epoch": 0.5514338964708712, + "flos": 27817283221920.0, + "grad_norm": 4.050139033644261, + "language_loss": 0.70900369, + "learning_rate": 1.764519084611443e-06, + "loss": 0.73688078, + "num_input_tokens_seen": 98970560, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.5, + "step": 4586, + "time_per_iteration": 3.052567958831787 + }, + { + "auxiliary_loss_clip": 0.01515163, + "auxiliary_loss_mlp": 0.01277957, + "balance_loss_clip": 1.15326166, + "balance_loss_mlp": 1.02447093, + "epoch": 0.5515541393615102, + "flos": 21910709773440.0, + "grad_norm": 5.206772624608187, + "language_loss": 0.77785939, + "learning_rate": 1.7637455490583505e-06, + "loss": 0.8057906, + "num_input_tokens_seen": 98989885, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.53320312, + "step": 4587, + "time_per_iteration": 2.9383835792541504 + }, + { + "auxiliary_loss_clip": 0.01508713, + "auxiliary_loss_mlp": 0.01277446, + "balance_loss_clip": 1.14611924, + "balance_loss_mlp": 1.02548528, + "epoch": 0.5516743822521494, + "flos": 20487517217280.0, + "grad_norm": 2.1890762727158264, + "language_loss": 0.77566731, + "learning_rate": 1.7629720493448701e-06, + "loss": 0.8035289, + "num_input_tokens_seen": 99007180, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.51757812, + "step": 4588, + "time_per_iteration": 3.0316905975341797 + }, + { + "auxiliary_loss_clip": 0.01512845, + "auxiliary_loss_mlp": 0.01273461, + "balance_loss_clip": 1.1507051, + "balance_loss_mlp": 1.02283549, + "epoch": 0.5517946251427884, + "flos": 14942213136000.0, + "grad_norm": 1.6709163863660388, + "language_loss": 0.84956843, + "learning_rate": 1.7621985855883418e-06, + "loss": 0.87743145, + "num_input_tokens_seen": 99023880, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.50390625, + "step": 4589, + "time_per_iteration": 3.0383360385894775 + }, + { + "auxiliary_loss_clip": 0.01509301, + "auxiliary_loss_mlp": 0.01274846, + "balance_loss_clip": 1.14778805, + "balance_loss_mlp": 1.02441144, + "epoch": 0.5519148680334275, + "flos": 18406795521600.0, + "grad_norm": 1.8079207306551055, + "language_loss": 0.72466058, + "learning_rate": 1.7614251579060983e-06, + "loss": 0.75250208, + "num_input_tokens_seen": 99042475, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 2.50195312, + "step": 4590, + "time_per_iteration": 2.985785961151123 + }, + { + "auxiliary_loss_clip": 0.01512984, + "auxiliary_loss_mlp": 0.0127294, + "balance_loss_clip": 1.1532495, + "balance_loss_mlp": 1.02078867, + "epoch": 0.5520351109240667, + "flos": 25115367928320.0, + "grad_norm": 4.686796946966674, + "language_loss": 0.84432876, + "learning_rate": 1.76065176641547e-06, + "loss": 0.87218803, + "num_input_tokens_seen": 99065185, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 2.51953125, + "step": 4591, + "time_per_iteration": 3.037114381790161 + }, + { + "auxiliary_loss_clip": 0.01508514, + "auxiliary_loss_mlp": 0.0127465, + "balance_loss_clip": 1.14708388, + "balance_loss_mlp": 1.02535939, + "epoch": 0.5521553538147057, + "flos": 21071692566720.0, + "grad_norm": 1.8410350693874558, + "language_loss": 0.78001463, + "learning_rate": 1.759878411233777e-06, + "loss": 0.80784631, + "num_input_tokens_seen": 99083645, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 2.49023438, + "step": 4592, + "time_per_iteration": 2.9261555671691895 + }, + { + "auxiliary_loss_clip": 0.01510232, + "auxiliary_loss_mlp": 0.01285645, + "balance_loss_clip": 1.14772058, + "balance_loss_mlp": 1.03654528, + "epoch": 0.5522755967053448, + "flos": 18882002174400.0, + "grad_norm": 2.43410965603599, + "language_loss": 0.7574113, + "learning_rate": 1.7591050924783388e-06, + "loss": 0.78537011, + "num_input_tokens_seen": 99100835, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.48828125, + "step": 4593, + "time_per_iteration": 3.0269510746002197 + }, + { + "auxiliary_loss_clip": 0.01502144, + "auxiliary_loss_mlp": 0.01225723, + "balance_loss_clip": 1.14570332, + "balance_loss_mlp": 1.02201843, + "epoch": 0.5523958395959839, + "flos": 64682570711520.0, + "grad_norm": 0.845480122633803, + "language_loss": 0.57947004, + "learning_rate": 1.7583318102664661e-06, + "loss": 0.6067487, + "num_input_tokens_seen": 99168400, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.0390625, + "step": 4594, + "time_per_iteration": 3.53060245513916 + }, + { + "auxiliary_loss_clip": 0.01503651, + "auxiliary_loss_mlp": 0.01278805, + "balance_loss_clip": 1.14132369, + "balance_loss_mlp": 1.02798843, + "epoch": 0.552516082486623, + "flos": 10891293495840.0, + "grad_norm": 2.2205076516918307, + "language_loss": 0.79264492, + "learning_rate": 1.757558564715466e-06, + "loss": 0.82046944, + "num_input_tokens_seen": 99186475, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.50585938, + "step": 4595, + "time_per_iteration": 3.018239736557007 + }, + { + "auxiliary_loss_clip": 0.01515702, + "auxiliary_loss_mlp": 0.01279446, + "balance_loss_clip": 1.15551829, + "balance_loss_mlp": 1.02653193, + "epoch": 0.552636325377262, + "flos": 22201963028640.0, + "grad_norm": 3.212917217171501, + "language_loss": 0.74246919, + "learning_rate": 1.7567853559426386e-06, + "loss": 0.77042067, + "num_input_tokens_seen": 99203525, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 2.52734375, + "step": 4596, + "time_per_iteration": 3.977609395980835 + }, + { + "auxiliary_loss_clip": 0.01514135, + "auxiliary_loss_mlp": 0.01272886, + "balance_loss_clip": 1.15444815, + "balance_loss_mlp": 1.02302313, + "epoch": 0.5527565682679012, + "flos": 23990483265120.0, + "grad_norm": 2.350141041935525, + "language_loss": 0.75220025, + "learning_rate": 1.7560121840652797e-06, + "loss": 0.78007042, + "num_input_tokens_seen": 99222910, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 2.49609375, + "step": 4597, + "time_per_iteration": 3.9438071250915527 + }, + { + "auxiliary_loss_clip": 0.01521529, + "auxiliary_loss_mlp": 0.01271233, + "balance_loss_clip": 1.15972638, + "balance_loss_mlp": 1.0217514, + "epoch": 0.5528768111585403, + "flos": 19721322806400.0, + "grad_norm": 3.625659584823273, + "language_loss": 0.6900323, + "learning_rate": 1.7552390492006782e-06, + "loss": 0.71795994, + "num_input_tokens_seen": 99241230, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.4921875, + "step": 4598, + "time_per_iteration": 3.0215933322906494 + }, + { + "auxiliary_loss_clip": 0.015162, + "auxiliary_loss_mlp": 0.01285366, + "balance_loss_clip": 1.15419197, + "balance_loss_mlp": 1.03435862, + "epoch": 0.5529970540491793, + "flos": 26218519391520.0, + "grad_norm": 2.05170102416423, + "language_loss": 0.65648693, + "learning_rate": 1.7544659514661184e-06, + "loss": 0.6845026, + "num_input_tokens_seen": 99264320, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.5078125, + "step": 4599, + "time_per_iteration": 3.1238248348236084 + }, + { + "auxiliary_loss_clip": 0.01514405, + "auxiliary_loss_mlp": 0.01268091, + "balance_loss_clip": 1.15222788, + "balance_loss_mlp": 1.01803744, + "epoch": 0.5531172969398185, + "flos": 24428292387840.0, + "grad_norm": 2.2598385004404418, + "language_loss": 0.797804, + "learning_rate": 1.7536928909788786e-06, + "loss": 0.82562894, + "num_input_tokens_seen": 99283625, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.49804688, + "step": 4600, + "time_per_iteration": 3.956829786300659 + }, + { + "auxiliary_loss_clip": 0.01496456, + "auxiliary_loss_mlp": 0.01201569, + "balance_loss_clip": 1.14001489, + "balance_loss_mlp": 0.99786377, + "epoch": 0.5532375398304575, + "flos": 64912948905600.0, + "grad_norm": 0.8828118245789564, + "language_loss": 0.61916691, + "learning_rate": 1.752919867856231e-06, + "loss": 0.64614719, + "num_input_tokens_seen": 99335270, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.0390625, + "step": 4601, + "time_per_iteration": 3.3182342052459717 + }, + { + "auxiliary_loss_clip": 0.01507348, + "auxiliary_loss_mlp": 0.01268911, + "balance_loss_clip": 1.14517236, + "balance_loss_mlp": 1.02019274, + "epoch": 0.5533577827210966, + "flos": 19685176905600.0, + "grad_norm": 1.8148334226531437, + "language_loss": 0.79204756, + "learning_rate": 1.7521468822154436e-06, + "loss": 0.81981015, + "num_input_tokens_seen": 99354185, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.484375, + "step": 4602, + "time_per_iteration": 3.0413992404937744 + }, + { + "auxiliary_loss_clip": 0.01507705, + "auxiliary_loss_mlp": 0.01270873, + "balance_loss_clip": 1.14519, + "balance_loss_mlp": 1.02348948, + "epoch": 0.5534780256117358, + "flos": 32309235878400.0, + "grad_norm": 2.6599894021894457, + "language_loss": 0.7519663, + "learning_rate": 1.751373934173777e-06, + "loss": 0.77975202, + "num_input_tokens_seen": 99376930, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.47070312, + "step": 4603, + "time_per_iteration": 3.943411350250244 + }, + { + "auxiliary_loss_clip": 0.01510167, + "auxiliary_loss_mlp": 0.01268432, + "balance_loss_clip": 1.14802194, + "balance_loss_mlp": 1.01666188, + "epoch": 0.5535982685023748, + "flos": 23224706064000.0, + "grad_norm": 1.7310848365253877, + "language_loss": 0.73021919, + "learning_rate": 1.750601023848487e-06, + "loss": 0.7580052, + "num_input_tokens_seen": 99397655, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.515625, + "step": 4604, + "time_per_iteration": 3.0855486392974854 + }, + { + "auxiliary_loss_clip": 0.0150986, + "auxiliary_loss_mlp": 0.01270452, + "balance_loss_clip": 1.14705753, + "balance_loss_mlp": 1.02154279, + "epoch": 0.5537185113930139, + "flos": 24354824813280.0, + "grad_norm": 2.0929733242946176, + "language_loss": 0.73968297, + "learning_rate": 1.749828151356823e-06, + "loss": 0.7674861, + "num_input_tokens_seen": 99417850, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.48632812, + "step": 4605, + "time_per_iteration": 3.0361876487731934 + }, + { + "auxiliary_loss_clip": 0.01513499, + "auxiliary_loss_mlp": 0.01274116, + "balance_loss_clip": 1.15244007, + "balance_loss_mlp": 1.02501607, + "epoch": 0.553838754283653, + "flos": 23551270800480.0, + "grad_norm": 2.0726138750371272, + "language_loss": 0.75795043, + "learning_rate": 1.7490553168160297e-06, + "loss": 0.78582656, + "num_input_tokens_seen": 99438920, + "router_z_loss_clip": 3.609375, + "router_z_loss_mlp": 2.48828125, + "step": 4606, + "time_per_iteration": 2.9966650009155273 + }, + { + "auxiliary_loss_clip": 0.0150659, + "auxiliary_loss_mlp": 0.01280296, + "balance_loss_clip": 1.1457634, + "balance_loss_mlp": 1.03062403, + "epoch": 0.5539589971742921, + "flos": 17276980197600.0, + "grad_norm": 3.171380002262815, + "language_loss": 0.76743752, + "learning_rate": 1.748282520343345e-06, + "loss": 0.79530632, + "num_input_tokens_seen": 99457950, + "router_z_loss_clip": 3.609375, + "router_z_loss_mlp": 2.49414062, + "step": 4607, + "time_per_iteration": 2.9759175777435303 + }, + { + "auxiliary_loss_clip": 0.0151798, + "auxiliary_loss_mlp": 0.01284878, + "balance_loss_clip": 1.15544236, + "balance_loss_mlp": 1.03196335, + "epoch": 0.5540792400649311, + "flos": 27566689318560.0, + "grad_norm": 1.7817638633592106, + "language_loss": 0.78855991, + "learning_rate": 1.7475097620560023e-06, + "loss": 0.8165884, + "num_input_tokens_seen": 99478015, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.52734375, + "step": 4608, + "time_per_iteration": 3.0282187461853027 + }, + { + "auxiliary_loss_clip": 0.01513894, + "auxiliary_loss_mlp": 0.01281211, + "balance_loss_clip": 1.15338242, + "balance_loss_mlp": 1.03287435, + "epoch": 0.5541994829555702, + "flos": 23880945646080.0, + "grad_norm": 2.0288725042784455, + "language_loss": 0.71004415, + "learning_rate": 1.746737042071228e-06, + "loss": 0.73799515, + "num_input_tokens_seen": 99496520, + "router_z_loss_clip": 3.6015625, + "router_z_loss_mlp": 2.48046875, + "step": 4609, + "time_per_iteration": 2.998063325881958 + }, + { + "auxiliary_loss_clip": 0.01506328, + "auxiliary_loss_mlp": 0.01270392, + "balance_loss_clip": 1.14229441, + "balance_loss_mlp": 1.02243662, + "epoch": 0.5543197258462094, + "flos": 20117524373280.0, + "grad_norm": 1.9947740825793925, + "language_loss": 0.78835166, + "learning_rate": 1.7459643605062424e-06, + "loss": 0.81611884, + "num_input_tokens_seen": 99513780, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 2.4765625, + "step": 4610, + "time_per_iteration": 2.9684829711914062 + }, + { + "auxiliary_loss_clip": 0.01509523, + "auxiliary_loss_mlp": 0.01271078, + "balance_loss_clip": 1.14713216, + "balance_loss_mlp": 1.01988006, + "epoch": 0.5544399687368484, + "flos": 20918309630400.0, + "grad_norm": 4.590268969733785, + "language_loss": 0.80732238, + "learning_rate": 1.745191717478262e-06, + "loss": 0.83512843, + "num_input_tokens_seen": 99532360, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.50976562, + "step": 4611, + "time_per_iteration": 3.166224718093872 + }, + { + "auxiliary_loss_clip": 0.01506371, + "auxiliary_loss_mlp": 0.01276962, + "balance_loss_clip": 1.14289904, + "balance_loss_mlp": 1.02824402, + "epoch": 0.5545602116274875, + "flos": 25520937750720.0, + "grad_norm": 1.9324760093218458, + "language_loss": 0.79750448, + "learning_rate": 1.7444191131044948e-06, + "loss": 0.82533783, + "num_input_tokens_seen": 99552635, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.484375, + "step": 4612, + "time_per_iteration": 3.095496892929077 + }, + { + "auxiliary_loss_clip": 0.01508971, + "auxiliary_loss_mlp": 0.0127077, + "balance_loss_clip": 1.14674568, + "balance_loss_mlp": 1.02052569, + "epoch": 0.5546804545181266, + "flos": 20997656069760.0, + "grad_norm": 1.777515907586385, + "language_loss": 0.72966373, + "learning_rate": 1.7436465475021456e-06, + "loss": 0.75746119, + "num_input_tokens_seen": 99572685, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 2.5, + "step": 4613, + "time_per_iteration": 3.031658172607422 + }, + { + "auxiliary_loss_clip": 0.01503131, + "auxiliary_loss_mlp": 0.01267623, + "balance_loss_clip": 1.13943696, + "balance_loss_mlp": 1.02023959, + "epoch": 0.5548006974087657, + "flos": 26836071886080.0, + "grad_norm": 4.005712723389524, + "language_loss": 0.7128076, + "learning_rate": 1.7428740207884111e-06, + "loss": 0.74051511, + "num_input_tokens_seen": 99593565, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.47070312, + "step": 4614, + "time_per_iteration": 3.0749282836914062 + }, + { + "auxiliary_loss_clip": 0.01512409, + "auxiliary_loss_mlp": 0.0127351, + "balance_loss_clip": 1.14993834, + "balance_loss_mlp": 1.02212179, + "epoch": 0.5549209402994048, + "flos": 33659226357120.0, + "grad_norm": 1.9908935488936879, + "language_loss": 0.6111927, + "learning_rate": 1.7421015330804833e-06, + "loss": 0.63905191, + "num_input_tokens_seen": 99613485, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.51171875, + "step": 4615, + "time_per_iteration": 3.096301317214966 + }, + { + "auxiliary_loss_clip": 0.01504649, + "auxiliary_loss_mlp": 0.01275105, + "balance_loss_clip": 1.14275789, + "balance_loss_mlp": 1.02657747, + "epoch": 0.5550411831900439, + "flos": 23771825236800.0, + "grad_norm": 2.2327914367694297, + "language_loss": 0.72787637, + "learning_rate": 1.7413290844955475e-06, + "loss": 0.75567389, + "num_input_tokens_seen": 99633515, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.48242188, + "step": 4616, + "time_per_iteration": 3.0537543296813965 + }, + { + "auxiliary_loss_clip": 0.01513652, + "auxiliary_loss_mlp": 0.01285672, + "balance_loss_clip": 1.1504395, + "balance_loss_mlp": 1.03485608, + "epoch": 0.555161426080683, + "flos": 21653250873120.0, + "grad_norm": 2.3303612782744727, + "language_loss": 0.78691894, + "learning_rate": 1.7405566751507843e-06, + "loss": 0.8149122, + "num_input_tokens_seen": 99651560, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.50585938, + "step": 4617, + "time_per_iteration": 2.9979982376098633 + }, + { + "auxiliary_loss_clip": 0.0150118, + "auxiliary_loss_mlp": 0.01268867, + "balance_loss_clip": 1.1384871, + "balance_loss_mlp": 1.02110291, + "epoch": 0.555281668971322, + "flos": 49566758929920.0, + "grad_norm": 38.22502943425175, + "language_loss": 0.67804277, + "learning_rate": 1.7397843051633668e-06, + "loss": 0.70574325, + "num_input_tokens_seen": 99674255, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.47460938, + "step": 4618, + "time_per_iteration": 3.2240281105041504 + }, + { + "auxiliary_loss_clip": 0.01502031, + "auxiliary_loss_mlp": 0.01272702, + "balance_loss_clip": 1.13732696, + "balance_loss_mlp": 1.02283978, + "epoch": 0.5554019118619612, + "flos": 20743762416480.0, + "grad_norm": 1.8784528645151515, + "language_loss": 0.71241409, + "learning_rate": 1.739011974650464e-06, + "loss": 0.74016142, + "num_input_tokens_seen": 99693585, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 2.49609375, + "step": 4619, + "time_per_iteration": 3.0285775661468506 + }, + { + "auxiliary_loss_clip": 0.01502664, + "auxiliary_loss_mlp": 0.01285835, + "balance_loss_clip": 1.13818932, + "balance_loss_mlp": 1.03120399, + "epoch": 0.5555221547526003, + "flos": 25485550413120.0, + "grad_norm": 2.0646402396104158, + "language_loss": 0.76637745, + "learning_rate": 1.7382396837292365e-06, + "loss": 0.79426247, + "num_input_tokens_seen": 99714045, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 2.54492188, + "step": 4620, + "time_per_iteration": 3.069915533065796 + }, + { + "auxiliary_loss_clip": 0.01507059, + "auxiliary_loss_mlp": 0.01273917, + "balance_loss_clip": 1.14225435, + "balance_loss_mlp": 1.02386403, + "epoch": 0.5556423976432393, + "flos": 21764684900160.0, + "grad_norm": 1.8151468889222564, + "language_loss": 0.73470008, + "learning_rate": 1.737467432516841e-06, + "loss": 0.76250982, + "num_input_tokens_seen": 99734145, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 2.49804688, + "step": 4621, + "time_per_iteration": 3.116776704788208 + }, + { + "auxiliary_loss_clip": 0.01506205, + "auxiliary_loss_mlp": 0.01270345, + "balance_loss_clip": 1.1410234, + "balance_loss_mlp": 1.02048266, + "epoch": 0.5557626405338785, + "flos": 24902512908480.0, + "grad_norm": 2.606525364708524, + "language_loss": 0.74445283, + "learning_rate": 1.7366952211304274e-06, + "loss": 0.77221835, + "num_input_tokens_seen": 99751990, + "router_z_loss_clip": 3.65429688, + "router_z_loss_mlp": 2.49609375, + "step": 4622, + "time_per_iteration": 3.2297985553741455 + }, + { + "auxiliary_loss_clip": 0.01502584, + "auxiliary_loss_mlp": 0.01278899, + "balance_loss_clip": 1.13824534, + "balance_loss_mlp": 1.03094363, + "epoch": 0.5558828834245175, + "flos": 18699490046880.0, + "grad_norm": 2.4804336802328826, + "language_loss": 0.83561265, + "learning_rate": 1.735923049687139e-06, + "loss": 0.8634274, + "num_input_tokens_seen": 99768565, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 2.4765625, + "step": 4623, + "time_per_iteration": 3.877934455871582 + }, + { + "auxiliary_loss_clip": 0.01506099, + "auxiliary_loss_mlp": 0.01272241, + "balance_loss_clip": 1.14261365, + "balance_loss_mlp": 1.02256894, + "epoch": 0.5560031263151566, + "flos": 27274677500160.0, + "grad_norm": 1.9260378159020164, + "language_loss": 0.73797262, + "learning_rate": 1.7351509183041144e-06, + "loss": 0.76575601, + "num_input_tokens_seen": 99788895, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.49414062, + "step": 4624, + "time_per_iteration": 4.0296924114227295 + }, + { + "auxiliary_loss_clip": 0.01507846, + "auxiliary_loss_mlp": 0.01272528, + "balance_loss_clip": 1.14543056, + "balance_loss_mlp": 1.02361917, + "epoch": 0.5561233692057957, + "flos": 23405966562240.0, + "grad_norm": 1.8552096111560155, + "language_loss": 0.71469665, + "learning_rate": 1.7343788270984852e-06, + "loss": 0.74250042, + "num_input_tokens_seen": 99808035, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.48632812, + "step": 4625, + "time_per_iteration": 3.097454786300659 + }, + { + "auxiliary_loss_clip": 0.01508786, + "auxiliary_loss_mlp": 0.01276586, + "balance_loss_clip": 1.14497852, + "balance_loss_mlp": 1.02634203, + "epoch": 0.5562436120964348, + "flos": 37673962168320.0, + "grad_norm": 2.4476940565854917, + "language_loss": 0.74646544, + "learning_rate": 1.7336067761873764e-06, + "loss": 0.77431917, + "num_input_tokens_seen": 99830460, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 2.5, + "step": 4626, + "time_per_iteration": 3.089071035385132 + }, + { + "auxiliary_loss_clip": 0.01502995, + "auxiliary_loss_mlp": 0.01284097, + "balance_loss_clip": 1.1399473, + "balance_loss_mlp": 1.03156435, + "epoch": 0.5563638549870739, + "flos": 25157582334720.0, + "grad_norm": 1.9152366807822365, + "language_loss": 0.76574922, + "learning_rate": 1.7328347656879076e-06, + "loss": 0.79362011, + "num_input_tokens_seen": 99850320, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.5234375, + "step": 4627, + "time_per_iteration": 3.057095527648926 + }, + { + "auxiliary_loss_clip": 0.0150409, + "auxiliary_loss_mlp": 0.01277253, + "balance_loss_clip": 1.13832629, + "balance_loss_mlp": 1.02739, + "epoch": 0.556484097877713, + "flos": 13583081970720.0, + "grad_norm": 2.37118680821713, + "language_loss": 0.68342137, + "learning_rate": 1.7320627957171927e-06, + "loss": 0.71123481, + "num_input_tokens_seen": 99864980, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 2.49609375, + "step": 4628, + "time_per_iteration": 3.831406354904175 + }, + { + "auxiliary_loss_clip": 0.01512091, + "auxiliary_loss_mlp": 0.01287707, + "balance_loss_clip": 1.14924622, + "balance_loss_mlp": 1.03612781, + "epoch": 0.5566043407683521, + "flos": 24683703167520.0, + "grad_norm": 2.5237927912430096, + "language_loss": 0.81486738, + "learning_rate": 1.7312908663923382e-06, + "loss": 0.84286541, + "num_input_tokens_seen": 99881155, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.51367188, + "step": 4629, + "time_per_iteration": 2.9780445098876953 + }, + { + "auxiliary_loss_clip": 0.01503553, + "auxiliary_loss_mlp": 0.01279342, + "balance_loss_clip": 1.13753128, + "balance_loss_mlp": 1.02947974, + "epoch": 0.5567245836589911, + "flos": 20589886414080.0, + "grad_norm": 2.366962732899596, + "language_loss": 0.67464042, + "learning_rate": 1.7305189778304463e-06, + "loss": 0.70246935, + "num_input_tokens_seen": 99899330, + "router_z_loss_clip": 3.66015625, + "router_z_loss_mlp": 2.49609375, + "step": 4630, + "time_per_iteration": 3.766112804412842 + }, + { + "auxiliary_loss_clip": 0.01509367, + "auxiliary_loss_mlp": 0.01280615, + "balance_loss_clip": 1.14518237, + "balance_loss_mlp": 1.03037107, + "epoch": 0.5568448265496303, + "flos": 20706289030080.0, + "grad_norm": 2.6053302972787735, + "language_loss": 0.80385429, + "learning_rate": 1.729747130148611e-06, + "loss": 0.83175409, + "num_input_tokens_seen": 99918525, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 2.5, + "step": 4631, + "time_per_iteration": 3.1798646450042725 + }, + { + "auxiliary_loss_clip": 0.01503896, + "auxiliary_loss_mlp": 0.01294328, + "balance_loss_clip": 1.13874841, + "balance_loss_mlp": 1.03950632, + "epoch": 0.5569650694402694, + "flos": 25305351903360.0, + "grad_norm": 2.90854441310049, + "language_loss": 0.77094352, + "learning_rate": 1.7289753234639208e-06, + "loss": 0.79892576, + "num_input_tokens_seen": 99937500, + "router_z_loss_clip": 3.65429688, + "router_z_loss_mlp": 2.546875, + "step": 4632, + "time_per_iteration": 3.0820255279541016 + }, + { + "auxiliary_loss_clip": 0.01506486, + "auxiliary_loss_mlp": 0.01281043, + "balance_loss_clip": 1.14186144, + "balance_loss_mlp": 1.02870059, + "epoch": 0.5570853123309084, + "flos": 19714306096800.0, + "grad_norm": 1.8280635999713069, + "language_loss": 0.76369405, + "learning_rate": 1.7282035578934592e-06, + "loss": 0.79156935, + "num_input_tokens_seen": 99955665, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 2.52148438, + "step": 4633, + "time_per_iteration": 3.12457537651062 + }, + { + "auxiliary_loss_clip": 0.01504277, + "auxiliary_loss_mlp": 0.01278117, + "balance_loss_clip": 1.13825178, + "balance_loss_mlp": 1.02634692, + "epoch": 0.5572055552215476, + "flos": 16109994912480.0, + "grad_norm": 1.7628073870560517, + "language_loss": 0.78693676, + "learning_rate": 1.727431833554301e-06, + "loss": 0.81476068, + "num_input_tokens_seen": 99974140, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 2.515625, + "step": 4634, + "time_per_iteration": 3.0879690647125244 + }, + { + "auxiliary_loss_clip": 0.01506197, + "auxiliary_loss_mlp": 0.01283405, + "balance_loss_clip": 1.14154553, + "balance_loss_mlp": 1.03258896, + "epoch": 0.5573257981121866, + "flos": 17130727755360.0, + "grad_norm": 2.830005174204633, + "language_loss": 0.77636433, + "learning_rate": 1.7266601505635175e-06, + "loss": 0.80426031, + "num_input_tokens_seen": 99991480, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 2.50585938, + "step": 4635, + "time_per_iteration": 3.1244850158691406 + }, + { + "auxiliary_loss_clip": 0.01511487, + "auxiliary_loss_mlp": 0.01270106, + "balance_loss_clip": 1.14614356, + "balance_loss_mlp": 1.02234125, + "epoch": 0.5574460410028257, + "flos": 18809027665920.0, + "grad_norm": 3.8161742278378052, + "language_loss": 0.75923944, + "learning_rate": 1.7258885090381717e-06, + "loss": 0.78705543, + "num_input_tokens_seen": 100009520, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 2.47460938, + "step": 4636, + "time_per_iteration": 3.0298914909362793 + }, + { + "auxiliary_loss_clip": 0.01503209, + "auxiliary_loss_mlp": 0.01274697, + "balance_loss_clip": 1.13838446, + "balance_loss_mlp": 1.02769518, + "epoch": 0.5575662838934649, + "flos": 29645249109120.0, + "grad_norm": 1.8461962363329574, + "language_loss": 0.78823698, + "learning_rate": 1.7251169090953213e-06, + "loss": 0.81601602, + "num_input_tokens_seen": 100029995, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 2.46679688, + "step": 4637, + "time_per_iteration": 3.0474700927734375 + }, + { + "auxiliary_loss_clip": 0.01500994, + "auxiliary_loss_mlp": 0.01276898, + "balance_loss_clip": 1.13559747, + "balance_loss_mlp": 1.0268445, + "epoch": 0.5576865267841039, + "flos": 22056658790400.0, + "grad_norm": 3.8139973098599125, + "language_loss": 0.76993895, + "learning_rate": 1.7243453508520168e-06, + "loss": 0.79771781, + "num_input_tokens_seen": 100046980, + "router_z_loss_clip": 3.65429688, + "router_z_loss_mlp": 2.49804688, + "step": 4638, + "time_per_iteration": 3.0550148487091064 + }, + { + "auxiliary_loss_clip": 0.01506732, + "auxiliary_loss_mlp": 0.01280011, + "balance_loss_clip": 1.14303446, + "balance_loss_mlp": 1.0282414, + "epoch": 0.557806769674743, + "flos": 17197633758240.0, + "grad_norm": 2.2647690362394934, + "language_loss": 0.84279126, + "learning_rate": 1.7235738344253038e-06, + "loss": 0.8706587, + "num_input_tokens_seen": 100060610, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 2.515625, + "step": 4639, + "time_per_iteration": 3.013995409011841 + }, + { + "auxiliary_loss_clip": 0.01508635, + "auxiliary_loss_mlp": 0.01269436, + "balance_loss_clip": 1.14472032, + "balance_loss_mlp": 1.01823854, + "epoch": 0.557927012565382, + "flos": 24714766694880.0, + "grad_norm": 1.8085370750186043, + "language_loss": 0.82720089, + "learning_rate": 1.72280235993222e-06, + "loss": 0.8549816, + "num_input_tokens_seen": 100078915, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 2.50976562, + "step": 4640, + "time_per_iteration": 3.0078694820404053 + }, + { + "auxiliary_loss_clip": 0.01507464, + "auxiliary_loss_mlp": 0.01273203, + "balance_loss_clip": 1.1425755, + "balance_loss_mlp": 1.02219582, + "epoch": 0.5580472554560212, + "flos": 16985006307360.0, + "grad_norm": 5.798296258906449, + "language_loss": 0.69599068, + "learning_rate": 1.722030927489798e-06, + "loss": 0.72379732, + "num_input_tokens_seen": 100096195, + "router_z_loss_clip": 3.65429688, + "router_z_loss_mlp": 2.5078125, + "step": 4641, + "time_per_iteration": 3.0323526859283447 + }, + { + "auxiliary_loss_clip": 0.01511258, + "auxiliary_loss_mlp": 0.01267104, + "balance_loss_clip": 1.14674473, + "balance_loss_mlp": 1.01781392, + "epoch": 0.5581674983466602, + "flos": 23511104514720.0, + "grad_norm": 1.8886368160820013, + "language_loss": 0.74402446, + "learning_rate": 1.7212595372150634e-06, + "loss": 0.77180809, + "num_input_tokens_seen": 100116175, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.49023438, + "step": 4642, + "time_per_iteration": 2.990156650543213 + }, + { + "auxiliary_loss_clip": 0.01512815, + "auxiliary_loss_mlp": 0.01271119, + "balance_loss_clip": 1.14728796, + "balance_loss_mlp": 1.0246892, + "epoch": 0.5582877412372993, + "flos": 13482002331360.0, + "grad_norm": 2.203060369456348, + "language_loss": 0.72906137, + "learning_rate": 1.720488189225035e-06, + "loss": 0.75690067, + "num_input_tokens_seen": 100133875, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 2.4609375, + "step": 4643, + "time_per_iteration": 3.0193371772766113 + }, + { + "auxiliary_loss_clip": 0.0150854, + "auxiliary_loss_mlp": 0.01266661, + "balance_loss_clip": 1.14325261, + "balance_loss_mlp": 1.01775169, + "epoch": 0.5584079841279385, + "flos": 21905513615520.0, + "grad_norm": 2.5467778339581075, + "language_loss": 0.79285896, + "learning_rate": 1.7197168836367265e-06, + "loss": 0.82061094, + "num_input_tokens_seen": 100150685, + "router_z_loss_clip": 3.65429688, + "router_z_loss_mlp": 2.48632812, + "step": 4644, + "time_per_iteration": 3.0459694862365723 + }, + { + "auxiliary_loss_clip": 0.01506676, + "auxiliary_loss_mlp": 0.01271995, + "balance_loss_clip": 1.13988757, + "balance_loss_mlp": 1.02651906, + "epoch": 0.5585282270185775, + "flos": 18845552848320.0, + "grad_norm": 1.9834275052886547, + "language_loss": 0.8169601, + "learning_rate": 1.7189456205671433e-06, + "loss": 0.84474671, + "num_input_tokens_seen": 100169530, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 2.45117188, + "step": 4645, + "time_per_iteration": 2.9810991287231445 + }, + { + "auxiliary_loss_clip": 0.01507032, + "auxiliary_loss_mlp": 0.01278253, + "balance_loss_clip": 1.14098048, + "balance_loss_mlp": 1.02800906, + "epoch": 0.5586484699092166, + "flos": 21870429703200.0, + "grad_norm": 2.057521446257884, + "language_loss": 0.82562661, + "learning_rate": 1.7181744001332866e-06, + "loss": 0.8534795, + "num_input_tokens_seen": 100188140, + "router_z_loss_clip": 3.66210938, + "router_z_loss_mlp": 2.5, + "step": 4646, + "time_per_iteration": 3.0699710845947266 + }, + { + "auxiliary_loss_clip": 0.0151357, + "auxiliary_loss_mlp": 0.01273149, + "balance_loss_clip": 1.14750874, + "balance_loss_mlp": 1.02385807, + "epoch": 0.5587687127998557, + "flos": 22895448428160.0, + "grad_norm": 1.96691263569134, + "language_loss": 0.63524377, + "learning_rate": 1.7174032224521493e-06, + "loss": 0.66311097, + "num_input_tokens_seen": 100206850, + "router_z_loss_clip": 3.66210938, + "router_z_loss_mlp": 2.49023438, + "step": 4647, + "time_per_iteration": 2.993063449859619 + }, + { + "auxiliary_loss_clip": 0.01503562, + "auxiliary_loss_mlp": 0.01276416, + "balance_loss_clip": 1.13899672, + "balance_loss_mlp": 1.02731633, + "epoch": 0.5588889556904948, + "flos": 20305156802400.0, + "grad_norm": 2.6296587780543352, + "language_loss": 0.69856864, + "learning_rate": 1.7166320876407184e-06, + "loss": 0.72636843, + "num_input_tokens_seen": 100226270, + "router_z_loss_clip": 3.65039062, + "router_z_loss_mlp": 2.48828125, + "step": 4648, + "time_per_iteration": 3.0432770252227783 + }, + { + "auxiliary_loss_clip": 0.015083, + "auxiliary_loss_mlp": 0.01286378, + "balance_loss_clip": 1.14262104, + "balance_loss_mlp": 1.03479838, + "epoch": 0.5590091985811338, + "flos": 16474033035360.0, + "grad_norm": 1.918549605298823, + "language_loss": 0.67826921, + "learning_rate": 1.7158609958159742e-06, + "loss": 0.70621598, + "num_input_tokens_seen": 100243675, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 2.51367188, + "step": 4649, + "time_per_iteration": 3.099273204803467 + }, + { + "auxiliary_loss_clip": 0.01509113, + "auxiliary_loss_mlp": 0.01281995, + "balance_loss_clip": 1.14387667, + "balance_loss_mlp": 1.03136945, + "epoch": 0.559129441471773, + "flos": 14533609060800.0, + "grad_norm": 2.50836858115104, + "language_loss": 0.78438658, + "learning_rate": 1.7150899470948911e-06, + "loss": 0.8122977, + "num_input_tokens_seen": 100258940, + "router_z_loss_clip": 3.65039062, + "router_z_loss_mlp": 2.50390625, + "step": 4650, + "time_per_iteration": 2.967822551727295 + }, + { + "auxiliary_loss_clip": 0.01490299, + "auxiliary_loss_mlp": 0.01211449, + "balance_loss_clip": 1.12708378, + "balance_loss_mlp": 1.01003265, + "epoch": 0.5592496843624121, + "flos": 60527992317120.0, + "grad_norm": 0.802036280003759, + "language_loss": 0.56566799, + "learning_rate": 1.7143189415944365e-06, + "loss": 0.59268534, + "num_input_tokens_seen": 100323400, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.015625, + "step": 4651, + "time_per_iteration": 5.206156015396118 + }, + { + "auxiliary_loss_clip": 0.01507559, + "auxiliary_loss_mlp": 0.0128019, + "balance_loss_clip": 1.14224482, + "balance_loss_mlp": 1.03032732, + "epoch": 0.5593699272530511, + "flos": 20888346019680.0, + "grad_norm": 1.919780490816655, + "language_loss": 0.76584458, + "learning_rate": 1.7135479794315714e-06, + "loss": 0.79372203, + "num_input_tokens_seen": 100340355, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 2.49609375, + "step": 4652, + "time_per_iteration": 3.084230422973633 + }, + { + "auxiliary_loss_clip": 0.01506235, + "auxiliary_loss_mlp": 0.01267189, + "balance_loss_clip": 1.14117682, + "balance_loss_mlp": 1.02095032, + "epoch": 0.5594901701436903, + "flos": 12898433832480.0, + "grad_norm": 1.8308625068897326, + "language_loss": 0.78966188, + "learning_rate": 1.7127770607232502e-06, + "loss": 0.8173961, + "num_input_tokens_seen": 100358900, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 2.45898438, + "step": 4653, + "time_per_iteration": 3.1469109058380127 + }, + { + "auxiliary_loss_clip": 0.01502812, + "auxiliary_loss_mlp": 0.01279666, + "balance_loss_clip": 1.13741374, + "balance_loss_mlp": 1.03018463, + "epoch": 0.5596104130343293, + "flos": 23114334025440.0, + "grad_norm": 2.311556839318325, + "language_loss": 0.79605192, + "learning_rate": 1.7120061855864204e-06, + "loss": 0.82387668, + "num_input_tokens_seen": 100378910, + "router_z_loss_clip": 3.65820312, + "router_z_loss_mlp": 2.4921875, + "step": 4654, + "time_per_iteration": 3.041111469268799 + }, + { + "auxiliary_loss_clip": 0.01509044, + "auxiliary_loss_mlp": 0.01278123, + "balance_loss_clip": 1.14576674, + "balance_loss_mlp": 1.02749801, + "epoch": 0.5597306559249684, + "flos": 25960188143520.0, + "grad_norm": 2.70012388439772, + "language_loss": 0.71358699, + "learning_rate": 1.7112353541380233e-06, + "loss": 0.74145865, + "num_input_tokens_seen": 100398770, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.50390625, + "step": 4655, + "time_per_iteration": 3.9738669395446777 + }, + { + "auxiliary_loss_clip": 0.01506515, + "auxiliary_loss_mlp": 0.01279308, + "balance_loss_clip": 1.14040995, + "balance_loss_mlp": 1.03039861, + "epoch": 0.5598508988156076, + "flos": 22494505841280.0, + "grad_norm": 1.9651685590618992, + "language_loss": 0.72059292, + "learning_rate": 1.7104645664949931e-06, + "loss": 0.74845111, + "num_input_tokens_seen": 100421240, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 2.48632812, + "step": 4656, + "time_per_iteration": 3.1553006172180176 + }, + { + "auxiliary_loss_clip": 0.01498902, + "auxiliary_loss_mlp": 0.01279019, + "balance_loss_clip": 1.13134718, + "balance_loss_mlp": 1.02915621, + "epoch": 0.5599711417062466, + "flos": 23114789163360.0, + "grad_norm": 3.412561790226277, + "language_loss": 0.7197113, + "learning_rate": 1.7096938227742584e-06, + "loss": 0.74749053, + "num_input_tokens_seen": 100442370, + "router_z_loss_clip": 3.67773438, + "router_z_loss_mlp": 2.49609375, + "step": 4657, + "time_per_iteration": 3.033979892730713 + }, + { + "auxiliary_loss_clip": 0.01508684, + "auxiliary_loss_mlp": 0.01277785, + "balance_loss_clip": 1.14152312, + "balance_loss_mlp": 1.02811289, + "epoch": 0.5600913845968857, + "flos": 22341160833120.0, + "grad_norm": 2.993560489132428, + "language_loss": 0.84261101, + "learning_rate": 1.70892312309274e-06, + "loss": 0.87047577, + "num_input_tokens_seen": 100460260, + "router_z_loss_clip": 3.66992188, + "router_z_loss_mlp": 2.49414062, + "step": 4658, + "time_per_iteration": 3.0441370010375977 + }, + { + "auxiliary_loss_clip": 0.01499666, + "auxiliary_loss_mlp": 0.01283014, + "balance_loss_clip": 1.1333847, + "balance_loss_mlp": 1.03353238, + "epoch": 0.5602116274875248, + "flos": 17635404952800.0, + "grad_norm": 2.0478937768580496, + "language_loss": 0.68452072, + "learning_rate": 1.7081524675673523e-06, + "loss": 0.71234751, + "num_input_tokens_seen": 100475750, + "router_z_loss_clip": 3.65820312, + "router_z_loss_mlp": 2.4921875, + "step": 4659, + "time_per_iteration": 3.898690938949585 + }, + { + "auxiliary_loss_clip": 0.0149275, + "auxiliary_loss_mlp": 0.01206444, + "balance_loss_clip": 1.1283474, + "balance_loss_mlp": 1.00502777, + "epoch": 0.5603318703781639, + "flos": 70123419547200.0, + "grad_norm": 1.2882532841965253, + "language_loss": 0.59611809, + "learning_rate": 1.7073818563150026e-06, + "loss": 0.62311006, + "num_input_tokens_seen": 100537830, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.015625, + "step": 4660, + "time_per_iteration": 3.6118710041046143 + }, + { + "auxiliary_loss_clip": 0.01502122, + "auxiliary_loss_mlp": 0.01272774, + "balance_loss_clip": 1.13608456, + "balance_loss_mlp": 1.0232923, + "epoch": 0.560452113268803, + "flos": 18547510452480.0, + "grad_norm": 2.8905558194927052, + "language_loss": 0.8696022, + "learning_rate": 1.7066112894525935e-06, + "loss": 0.89735115, + "num_input_tokens_seen": 100555910, + "router_z_loss_clip": 3.66796875, + "router_z_loss_mlp": 2.4921875, + "step": 4661, + "time_per_iteration": 3.0829944610595703 + }, + { + "auxiliary_loss_clip": 0.01502453, + "auxiliary_loss_mlp": 0.0127615, + "balance_loss_clip": 1.13403952, + "balance_loss_mlp": 1.02914846, + "epoch": 0.5605723561594421, + "flos": 25267006169280.0, + "grad_norm": 1.5913910660424455, + "language_loss": 0.7257539, + "learning_rate": 1.7058407670970177e-06, + "loss": 0.75353986, + "num_input_tokens_seen": 100577385, + "router_z_loss_clip": 3.68359375, + "router_z_loss_mlp": 2.46679688, + "step": 4662, + "time_per_iteration": 3.0997233390808105 + }, + { + "auxiliary_loss_clip": 0.01506598, + "auxiliary_loss_mlp": 0.01276666, + "balance_loss_clip": 1.13907969, + "balance_loss_mlp": 1.02832949, + "epoch": 0.5606925990500812, + "flos": 20597054836320.0, + "grad_norm": 2.199603203843051, + "language_loss": 0.61519867, + "learning_rate": 1.7050702893651643e-06, + "loss": 0.64303124, + "num_input_tokens_seen": 100596965, + "router_z_loss_clip": 3.67773438, + "router_z_loss_mlp": 2.48046875, + "step": 4663, + "time_per_iteration": 3.0509016513824463 + }, + { + "auxiliary_loss_clip": 0.01500135, + "auxiliary_loss_mlp": 0.01280098, + "balance_loss_clip": 1.13309264, + "balance_loss_mlp": 1.03042603, + "epoch": 0.5608128419407202, + "flos": 35008913410560.0, + "grad_norm": 2.3039935516508523, + "language_loss": 0.75677419, + "learning_rate": 1.7042998563739134e-06, + "loss": 0.78457648, + "num_input_tokens_seen": 100615315, + "router_z_loss_clip": 3.66992188, + "router_z_loss_mlp": 2.49414062, + "step": 4664, + "time_per_iteration": 3.05778431892395 + }, + { + "auxiliary_loss_clip": 0.0150352, + "auxiliary_loss_mlp": 0.01289815, + "balance_loss_clip": 1.1362834, + "balance_loss_mlp": 1.03804517, + "epoch": 0.5609330848313594, + "flos": 24641868042720.0, + "grad_norm": 2.291098844539802, + "language_loss": 0.71795875, + "learning_rate": 1.703529468240139e-06, + "loss": 0.74589205, + "num_input_tokens_seen": 100634185, + "router_z_loss_clip": 3.66796875, + "router_z_loss_mlp": 2.515625, + "step": 4665, + "time_per_iteration": 3.0122244358062744 + }, + { + "auxiliary_loss_clip": 0.01500463, + "auxiliary_loss_mlp": 0.01274948, + "balance_loss_clip": 1.13377011, + "balance_loss_mlp": 1.02546692, + "epoch": 0.5610533277219985, + "flos": 18764461713600.0, + "grad_norm": 3.703667947269474, + "language_loss": 0.73548484, + "learning_rate": 1.7027591250807088e-06, + "loss": 0.76323891, + "num_input_tokens_seen": 100651360, + "router_z_loss_clip": 3.66992188, + "router_z_loss_mlp": 2.4921875, + "step": 4666, + "time_per_iteration": 3.1341495513916016 + }, + { + "auxiliary_loss_clip": 0.01501556, + "auxiliary_loss_mlp": 0.01280531, + "balance_loss_clip": 1.13427854, + "balance_loss_mlp": 1.03047752, + "epoch": 0.5611735706126375, + "flos": 15014163584160.0, + "grad_norm": 2.386046668960851, + "language_loss": 0.8473863, + "learning_rate": 1.7019888270124825e-06, + "loss": 0.87520713, + "num_input_tokens_seen": 100668525, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 2.49804688, + "step": 4667, + "time_per_iteration": 3.036344051361084 + }, + { + "auxiliary_loss_clip": 0.01510638, + "auxiliary_loss_mlp": 0.0127929, + "balance_loss_clip": 1.14553452, + "balance_loss_mlp": 1.03152514, + "epoch": 0.5612938135032767, + "flos": 16469823009600.0, + "grad_norm": 5.046189981262124, + "language_loss": 0.82215881, + "learning_rate": 1.7012185741523147e-06, + "loss": 0.85005814, + "num_input_tokens_seen": 100684850, + "router_z_loss_clip": 3.65429688, + "router_z_loss_mlp": 2.47460938, + "step": 4668, + "time_per_iteration": 2.956664800643921 + }, + { + "auxiliary_loss_clip": 0.0150453, + "auxiliary_loss_mlp": 0.01272532, + "balance_loss_clip": 1.13794804, + "balance_loss_mlp": 1.02228773, + "epoch": 0.5614140563939157, + "flos": 25668821103840.0, + "grad_norm": 2.043399098179536, + "language_loss": 0.62958324, + "learning_rate": 1.7004483666170514e-06, + "loss": 0.65735382, + "num_input_tokens_seen": 100705345, + "router_z_loss_clip": 3.66796875, + "router_z_loss_mlp": 2.5, + "step": 4669, + "time_per_iteration": 3.0330190658569336 + }, + { + "auxiliary_loss_clip": 0.01499785, + "auxiliary_loss_mlp": 0.01278082, + "balance_loss_clip": 1.13279867, + "balance_loss_mlp": 1.02955389, + "epoch": 0.5615342992845548, + "flos": 24719886996480.0, + "grad_norm": 2.227755164572331, + "language_loss": 0.8031677, + "learning_rate": 1.699678204523533e-06, + "loss": 0.83094633, + "num_input_tokens_seen": 100725210, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 2.48242188, + "step": 4670, + "time_per_iteration": 3.0342094898223877 + }, + { + "auxiliary_loss_clip": 0.01508217, + "auxiliary_loss_mlp": 0.01281854, + "balance_loss_clip": 1.14249182, + "balance_loss_mlp": 1.0314194, + "epoch": 0.5616545421751938, + "flos": 22018009631040.0, + "grad_norm": 4.635600946157366, + "language_loss": 0.6907661, + "learning_rate": 1.6989080879885918e-06, + "loss": 0.71866679, + "num_input_tokens_seen": 100743070, + "router_z_loss_clip": 3.65429688, + "router_z_loss_mlp": 2.50195312, + "step": 4671, + "time_per_iteration": 2.9596505165100098 + }, + { + "auxiliary_loss_clip": 0.01491607, + "auxiliary_loss_mlp": 0.01198929, + "balance_loss_clip": 1.12816477, + "balance_loss_mlp": 0.99827576, + "epoch": 0.561774785065833, + "flos": 53766358050240.0, + "grad_norm": 0.8954889876345564, + "language_loss": 0.61004156, + "learning_rate": 1.6981380171290544e-06, + "loss": 0.63694692, + "num_input_tokens_seen": 100804095, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.00390625, + "step": 4672, + "time_per_iteration": 3.486778497695923 + }, + { + "auxiliary_loss_clip": 0.0149807, + "auxiliary_loss_mlp": 0.01274907, + "balance_loss_clip": 1.13127649, + "balance_loss_mlp": 1.02657032, + "epoch": 0.5618950279564721, + "flos": 19751741555040.0, + "grad_norm": 1.9141333910022396, + "language_loss": 0.7473014, + "learning_rate": 1.6973679920617396e-06, + "loss": 0.77503115, + "num_input_tokens_seen": 100821630, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 2.48046875, + "step": 4673, + "time_per_iteration": 3.0030899047851562 + }, + { + "auxiliary_loss_clip": 0.01505086, + "auxiliary_loss_mlp": 0.01275401, + "balance_loss_clip": 1.13920856, + "balance_loss_mlp": 1.02496576, + "epoch": 0.5620152708471111, + "flos": 16802494179840.0, + "grad_norm": 4.472073305946217, + "language_loss": 0.85061586, + "learning_rate": 1.6965980129034603e-06, + "loss": 0.87842071, + "num_input_tokens_seen": 100839015, + "router_z_loss_clip": 3.66015625, + "router_z_loss_mlp": 2.50195312, + "step": 4674, + "time_per_iteration": 3.0446786880493164 + }, + { + "auxiliary_loss_clip": 0.01502366, + "auxiliary_loss_mlp": 0.01279511, + "balance_loss_clip": 1.1362648, + "balance_loss_mlp": 1.03308153, + "epoch": 0.5621355137377503, + "flos": 26800267338720.0, + "grad_norm": 1.6570678690667404, + "language_loss": 0.76671052, + "learning_rate": 1.6958280797710209e-06, + "loss": 0.79452932, + "num_input_tokens_seen": 100860940, + "router_z_loss_clip": 3.66210938, + "router_z_loss_mlp": 2.4609375, + "step": 4675, + "time_per_iteration": 3.1027376651763916 + }, + { + "auxiliary_loss_clip": 0.01498915, + "auxiliary_loss_mlp": 0.01200363, + "balance_loss_clip": 1.13579798, + "balance_loss_mlp": 1.00047302, + "epoch": 0.5622557566283893, + "flos": 61213664515680.0, + "grad_norm": 0.7181082023487376, + "language_loss": 0.54670572, + "learning_rate": 1.6950581927812198e-06, + "loss": 0.57369852, + "num_input_tokens_seen": 100920510, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 1.99609375, + "step": 4676, + "time_per_iteration": 3.289135217666626 + }, + { + "auxiliary_loss_clip": 0.01507885, + "auxiliary_loss_mlp": 0.01274343, + "balance_loss_clip": 1.14280128, + "balance_loss_mlp": 1.02772295, + "epoch": 0.5623759995190284, + "flos": 26470933846560.0, + "grad_norm": 1.999174492018843, + "language_loss": 0.78781372, + "learning_rate": 1.6942883520508486e-06, + "loss": 0.81563604, + "num_input_tokens_seen": 100939245, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 2.46289062, + "step": 4677, + "time_per_iteration": 3.128037214279175 + }, + { + "auxiliary_loss_clip": 0.01504285, + "auxiliary_loss_mlp": 0.01278075, + "balance_loss_clip": 1.13595843, + "balance_loss_mlp": 1.03393412, + "epoch": 0.5624962424096676, + "flos": 19392937518240.0, + "grad_norm": 2.6111148762037923, + "language_loss": 0.77028215, + "learning_rate": 1.693518557696691e-06, + "loss": 0.79810572, + "num_input_tokens_seen": 100958385, + "router_z_loss_clip": 3.6875, + "router_z_loss_mlp": 2.4375, + "step": 4678, + "time_per_iteration": 3.9948112964630127 + }, + { + "auxiliary_loss_clip": 0.01500995, + "auxiliary_loss_mlp": 0.01269896, + "balance_loss_clip": 1.13513064, + "balance_loss_mlp": 1.02270389, + "epoch": 0.5626164853003066, + "flos": 20669422494240.0, + "grad_norm": 3.0198147616043323, + "language_loss": 0.88948715, + "learning_rate": 1.6927488098355252e-06, + "loss": 0.91719615, + "num_input_tokens_seen": 100976015, + "router_z_loss_clip": 3.66210938, + "router_z_loss_mlp": 2.46875, + "step": 4679, + "time_per_iteration": 3.933378219604492 + }, + { + "auxiliary_loss_clip": 0.01502469, + "auxiliary_loss_mlp": 0.01209137, + "balance_loss_clip": 1.13894248, + "balance_loss_mlp": 1.00848389, + "epoch": 0.5627367281909457, + "flos": 62772224132160.0, + "grad_norm": 0.9133434006404169, + "language_loss": 0.63086164, + "learning_rate": 1.6919791085841201e-06, + "loss": 0.6579777, + "num_input_tokens_seen": 101033425, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.0078125, + "step": 4680, + "time_per_iteration": 3.323669672012329 + }, + { + "auxiliary_loss_clip": 0.01500564, + "auxiliary_loss_mlp": 0.01281471, + "balance_loss_clip": 1.133376, + "balance_loss_mlp": 1.03237152, + "epoch": 0.5628569710815848, + "flos": 12788820357120.0, + "grad_norm": 2.30174051506065, + "language_loss": 0.79240459, + "learning_rate": 1.6912094540592396e-06, + "loss": 0.82022494, + "num_input_tokens_seen": 101048945, + "router_z_loss_clip": 3.66992188, + "router_z_loss_mlp": 2.48828125, + "step": 4681, + "time_per_iteration": 3.0517406463623047 + }, + { + "auxiliary_loss_clip": 0.01506374, + "auxiliary_loss_mlp": 0.01270611, + "balance_loss_clip": 1.13907743, + "balance_loss_mlp": 1.02132082, + "epoch": 0.5629772139722239, + "flos": 13763242552320.0, + "grad_norm": 2.7454789831112314, + "language_loss": 0.81395388, + "learning_rate": 1.6904398463776393e-06, + "loss": 0.84172374, + "num_input_tokens_seen": 101062745, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 2.49023438, + "step": 4682, + "time_per_iteration": 3.825690984725952 + }, + { + "auxiliary_loss_clip": 0.0150477, + "auxiliary_loss_mlp": 0.01267187, + "balance_loss_clip": 1.14004469, + "balance_loss_mlp": 1.01827776, + "epoch": 0.5630974568628629, + "flos": 21469828469760.0, + "grad_norm": 1.7890675795212114, + "language_loss": 0.72577041, + "learning_rate": 1.6896702856560683e-06, + "loss": 0.75348997, + "num_input_tokens_seen": 101081840, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 2.48632812, + "step": 4683, + "time_per_iteration": 2.977308511734009 + }, + { + "auxiliary_loss_clip": 0.01506152, + "auxiliary_loss_mlp": 0.01279793, + "balance_loss_clip": 1.14085591, + "balance_loss_mlp": 1.03107524, + "epoch": 0.5632176997535021, + "flos": 14247627819840.0, + "grad_norm": 2.6734512127996224, + "language_loss": 0.69095945, + "learning_rate": 1.6889007720112677e-06, + "loss": 0.7188189, + "num_input_tokens_seen": 101099585, + "router_z_loss_clip": 3.65820312, + "router_z_loss_mlp": 2.484375, + "step": 4684, + "time_per_iteration": 3.0013749599456787 + }, + { + "auxiliary_loss_clip": 0.01503226, + "auxiliary_loss_mlp": 0.01282629, + "balance_loss_clip": 1.13764942, + "balance_loss_mlp": 1.03448272, + "epoch": 0.5633379426441412, + "flos": 20814271594560.0, + "grad_norm": 1.694068815370014, + "language_loss": 0.7778585, + "learning_rate": 1.6881313055599734e-06, + "loss": 0.80571705, + "num_input_tokens_seen": 101119515, + "router_z_loss_clip": 3.65820312, + "router_z_loss_mlp": 2.47851562, + "step": 4685, + "time_per_iteration": 2.9499294757843018 + }, + { + "auxiliary_loss_clip": 0.01502326, + "auxiliary_loss_mlp": 0.01286368, + "balance_loss_clip": 1.13709211, + "balance_loss_mlp": 1.03536081, + "epoch": 0.5634581855347802, + "flos": 22603057328160.0, + "grad_norm": 3.177771781948745, + "language_loss": 0.82454062, + "learning_rate": 1.6873618864189117e-06, + "loss": 0.8524276, + "num_input_tokens_seen": 101135285, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 2.5078125, + "step": 4686, + "time_per_iteration": 2.9778897762298584 + }, + { + "auxiliary_loss_clip": 0.01501679, + "auxiliary_loss_mlp": 0.01282742, + "balance_loss_clip": 1.13447356, + "balance_loss_mlp": 1.03421426, + "epoch": 0.5635784284254194, + "flos": 21509122407840.0, + "grad_norm": 2.930578499850977, + "language_loss": 0.78154635, + "learning_rate": 1.686592514704803e-06, + "loss": 0.80939054, + "num_input_tokens_seen": 101152680, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 2.48242188, + "step": 4687, + "time_per_iteration": 3.7380523681640625 + }, + { + "auxiliary_loss_clip": 0.01506261, + "auxiliary_loss_mlp": 0.0126142, + "balance_loss_clip": 1.14023352, + "balance_loss_mlp": 1.01689792, + "epoch": 0.5636986713160584, + "flos": 19829457083520.0, + "grad_norm": 3.1093049357535576, + "language_loss": 0.7072804, + "learning_rate": 1.685823190534361e-06, + "loss": 0.7349571, + "num_input_tokens_seen": 101170920, + "router_z_loss_clip": 3.66015625, + "router_z_loss_mlp": 2.44140625, + "step": 4688, + "time_per_iteration": 2.9147160053253174 + }, + { + "auxiliary_loss_clip": 0.01504348, + "auxiliary_loss_mlp": 0.01278817, + "balance_loss_clip": 1.13828373, + "balance_loss_mlp": 1.02723813, + "epoch": 0.5638189142066975, + "flos": 19794524883840.0, + "grad_norm": 1.9573768015263633, + "language_loss": 0.83833873, + "learning_rate": 1.6850539140242907e-06, + "loss": 0.86617041, + "num_input_tokens_seen": 101190180, + "router_z_loss_clip": 3.66210938, + "router_z_loss_mlp": 2.51367188, + "step": 4689, + "time_per_iteration": 3.0516903400421143 + }, + { + "auxiliary_loss_clip": 0.01502414, + "auxiliary_loss_mlp": 0.01278681, + "balance_loss_clip": 1.13763344, + "balance_loss_mlp": 1.03110707, + "epoch": 0.5639391570973367, + "flos": 22896017350560.0, + "grad_norm": 2.4616161602683886, + "language_loss": 0.82311332, + "learning_rate": 1.684284685291292e-06, + "loss": 0.85092425, + "num_input_tokens_seen": 101211825, + "router_z_loss_clip": 3.65039062, + "router_z_loss_mlp": 2.47265625, + "step": 4690, + "time_per_iteration": 3.0812759399414062 + }, + { + "auxiliary_loss_clip": 0.01506116, + "auxiliary_loss_mlp": 0.01285202, + "balance_loss_clip": 1.14115572, + "balance_loss_mlp": 1.0322876, + "epoch": 0.5640593999879757, + "flos": 23729041908000.0, + "grad_norm": 2.1454116664834917, + "language_loss": 0.81515008, + "learning_rate": 1.683515504452055e-06, + "loss": 0.84306324, + "num_input_tokens_seen": 101229200, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 2.52734375, + "step": 4691, + "time_per_iteration": 3.0954511165618896 + }, + { + "auxiliary_loss_clip": 0.01495803, + "auxiliary_loss_mlp": 0.01277402, + "balance_loss_clip": 1.13057709, + "balance_loss_mlp": 1.02639532, + "epoch": 0.5641796428786148, + "flos": 22712291521920.0, + "grad_norm": 1.85803749458177, + "language_loss": 0.66417456, + "learning_rate": 1.6827463716232648e-06, + "loss": 0.69190669, + "num_input_tokens_seen": 101249860, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 2.5078125, + "step": 4692, + "time_per_iteration": 3.0429766178131104 + }, + { + "auxiliary_loss_clip": 0.01507316, + "auxiliary_loss_mlp": 0.01281451, + "balance_loss_clip": 1.14247227, + "balance_loss_mlp": 1.03177905, + "epoch": 0.5642998857692539, + "flos": 19794031817760.0, + "grad_norm": 1.885540990336443, + "language_loss": 0.75859261, + "learning_rate": 1.6819772869215972e-06, + "loss": 0.78648025, + "num_input_tokens_seen": 101268940, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 2.49414062, + "step": 4693, + "time_per_iteration": 2.9748690128326416 + }, + { + "auxiliary_loss_clip": 0.01507587, + "auxiliary_loss_mlp": 0.01270597, + "balance_loss_clip": 1.14278007, + "balance_loss_mlp": 1.02397656, + "epoch": 0.564420128659893, + "flos": 23188180881600.0, + "grad_norm": 2.1786647807524715, + "language_loss": 0.81863737, + "learning_rate": 1.6812082504637228e-06, + "loss": 0.84641916, + "num_input_tokens_seen": 101290260, + "router_z_loss_clip": 3.65429688, + "router_z_loss_mlp": 2.46289062, + "step": 4694, + "time_per_iteration": 3.065655469894409 + }, + { + "auxiliary_loss_clip": 0.01506808, + "auxiliary_loss_mlp": 0.01280901, + "balance_loss_clip": 1.14174032, + "balance_loss_mlp": 1.03351831, + "epoch": 0.564540371550532, + "flos": 23260510611360.0, + "grad_norm": 1.6365682392244412, + "language_loss": 0.74265361, + "learning_rate": 1.6804392623663025e-06, + "loss": 0.7705307, + "num_input_tokens_seen": 101311465, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 2.47070312, + "step": 4695, + "time_per_iteration": 3.0844311714172363 + }, + { + "auxiliary_loss_clip": 0.01503007, + "auxiliary_loss_mlp": 0.01268504, + "balance_loss_clip": 1.13757706, + "balance_loss_mlp": 1.02341008, + "epoch": 0.5646606144411712, + "flos": 25012429809120.0, + "grad_norm": 1.9704070392123882, + "language_loss": 0.77877772, + "learning_rate": 1.6796703227459935e-06, + "loss": 0.80649287, + "num_input_tokens_seen": 101329420, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 2.44726562, + "step": 4696, + "time_per_iteration": 2.992739677429199 + }, + { + "auxiliary_loss_clip": 0.01499073, + "auxiliary_loss_mlp": 0.01265531, + "balance_loss_clip": 1.13409829, + "balance_loss_mlp": 1.02100897, + "epoch": 0.5647808573318103, + "flos": 36542667646080.0, + "grad_norm": 1.8326676059335623, + "language_loss": 0.75858361, + "learning_rate": 1.6789014317194407e-06, + "loss": 0.78622961, + "num_input_tokens_seen": 101350900, + "router_z_loss_clip": 3.65429688, + "router_z_loss_mlp": 2.44140625, + "step": 4697, + "time_per_iteration": 3.1609573364257812 + }, + { + "auxiliary_loss_clip": 0.0150921, + "auxiliary_loss_mlp": 0.01279213, + "balance_loss_clip": 1.1439805, + "balance_loss_mlp": 1.0287776, + "epoch": 0.5649011002224493, + "flos": 22530651742080.0, + "grad_norm": 3.8518881066805575, + "language_loss": 0.7309913, + "learning_rate": 1.6781325894032853e-06, + "loss": 0.75887555, + "num_input_tokens_seen": 101369860, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 2.50195312, + "step": 4698, + "time_per_iteration": 3.0134947299957275 + }, + { + "auxiliary_loss_clip": 0.01501921, + "auxiliary_loss_mlp": 0.01277039, + "balance_loss_clip": 1.13787317, + "balance_loss_mlp": 1.02870178, + "epoch": 0.5650213431130885, + "flos": 18517129632000.0, + "grad_norm": 2.6494837290878093, + "language_loss": 0.92221332, + "learning_rate": 1.6773637959141608e-06, + "loss": 0.95000291, + "num_input_tokens_seen": 101386835, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 2.48046875, + "step": 4699, + "time_per_iteration": 3.005948543548584 + }, + { + "auxiliary_loss_clip": 0.01503962, + "auxiliary_loss_mlp": 0.01268337, + "balance_loss_clip": 1.13865352, + "balance_loss_mlp": 1.02228892, + "epoch": 0.5651415860037275, + "flos": 17528029238880.0, + "grad_norm": 2.218287755306478, + "language_loss": 0.66560757, + "learning_rate": 1.6765950513686915e-06, + "loss": 0.69333053, + "num_input_tokens_seen": 101404945, + "router_z_loss_clip": 3.66015625, + "router_z_loss_mlp": 2.45703125, + "step": 4700, + "time_per_iteration": 2.9762279987335205 + }, + { + "auxiliary_loss_clip": 0.01507094, + "auxiliary_loss_mlp": 0.01278582, + "balance_loss_clip": 1.14247644, + "balance_loss_mlp": 1.02871895, + "epoch": 0.5652618288943666, + "flos": 25522379020800.0, + "grad_norm": 2.1299863185287276, + "language_loss": 0.76169467, + "learning_rate": 1.675826355883496e-06, + "loss": 0.78955138, + "num_input_tokens_seen": 101424160, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 2.49609375, + "step": 4701, + "time_per_iteration": 3.0674502849578857 + }, + { + "auxiliary_loss_clip": 0.01504845, + "auxiliary_loss_mlp": 0.01274893, + "balance_loss_clip": 1.13941455, + "balance_loss_mlp": 1.02579331, + "epoch": 0.5653820717850057, + "flos": 19685176905600.0, + "grad_norm": 2.2467249212837217, + "language_loss": 0.79051703, + "learning_rate": 1.6750577095751848e-06, + "loss": 0.81831443, + "num_input_tokens_seen": 101443270, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 2.48828125, + "step": 4702, + "time_per_iteration": 3.093358278274536 + }, + { + "auxiliary_loss_clip": 0.01506336, + "auxiliary_loss_mlp": 0.01277607, + "balance_loss_clip": 1.14322495, + "balance_loss_mlp": 1.0321312, + "epoch": 0.5655023146756448, + "flos": 26982172615680.0, + "grad_norm": 2.4267650269430523, + "language_loss": 0.72798544, + "learning_rate": 1.6742891125603605e-06, + "loss": 0.75582492, + "num_input_tokens_seen": 101464175, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.45117188, + "step": 4703, + "time_per_iteration": 3.045119285583496 + }, + { + "auxiliary_loss_clip": 0.01512437, + "auxiliary_loss_mlp": 0.01272239, + "balance_loss_clip": 1.14752722, + "balance_loss_mlp": 1.0254277, + "epoch": 0.5656225575662839, + "flos": 27671637630240.0, + "grad_norm": 2.092381514012535, + "language_loss": 0.71975946, + "learning_rate": 1.6735205649556185e-06, + "loss": 0.74760628, + "num_input_tokens_seen": 101484045, + "router_z_loss_clip": 3.65429688, + "router_z_loss_mlp": 2.46484375, + "step": 4704, + "time_per_iteration": 3.0624325275421143 + }, + { + "auxiliary_loss_clip": 0.01506686, + "auxiliary_loss_mlp": 0.01272197, + "balance_loss_clip": 1.14272475, + "balance_loss_mlp": 1.02271616, + "epoch": 0.5657428004569229, + "flos": 24351866416800.0, + "grad_norm": 1.6874855613249287, + "language_loss": 0.84975225, + "learning_rate": 1.6727520668775476e-06, + "loss": 0.87754107, + "num_input_tokens_seen": 101504330, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.4921875, + "step": 4705, + "time_per_iteration": 2.997291326522827 + }, + { + "auxiliary_loss_clip": 0.01507406, + "auxiliary_loss_mlp": 0.01267624, + "balance_loss_clip": 1.14435458, + "balance_loss_mlp": 1.01966834, + "epoch": 0.5658630433475621, + "flos": 21946172967360.0, + "grad_norm": 1.804440403339895, + "language_loss": 0.75647169, + "learning_rate": 1.6719836184427275e-06, + "loss": 0.78422201, + "num_input_tokens_seen": 101524635, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.4765625, + "step": 4706, + "time_per_iteration": 3.882537603378296 + }, + { + "auxiliary_loss_clip": 0.01508802, + "auxiliary_loss_mlp": 0.01272403, + "balance_loss_clip": 1.14671648, + "balance_loss_mlp": 1.02673697, + "epoch": 0.5659832862382012, + "flos": 30411633160800.0, + "grad_norm": 1.9689754227887906, + "language_loss": 0.64896679, + "learning_rate": 1.671215219767733e-06, + "loss": 0.67677885, + "num_input_tokens_seen": 101544095, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 2.453125, + "step": 4707, + "time_per_iteration": 3.9330978393554688 + }, + { + "auxiliary_loss_clip": 0.01512716, + "auxiliary_loss_mlp": 0.01272013, + "balance_loss_clip": 1.14938414, + "balance_loss_mlp": 1.02386713, + "epoch": 0.5661035291288402, + "flos": 13189876728480.0, + "grad_norm": 2.992091733442071, + "language_loss": 0.75907308, + "learning_rate": 1.670446870969127e-06, + "loss": 0.78692043, + "num_input_tokens_seen": 101561760, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.47851562, + "step": 4708, + "time_per_iteration": 3.03891658782959 + }, + { + "auxiliary_loss_clip": 0.01502837, + "auxiliary_loss_mlp": 0.01284057, + "balance_loss_clip": 1.13937056, + "balance_loss_mlp": 1.03514826, + "epoch": 0.5662237720194794, + "flos": 16144851255840.0, + "grad_norm": 2.1062715512038896, + "language_loss": 0.80265284, + "learning_rate": 1.6696785721634685e-06, + "loss": 0.83052182, + "num_input_tokens_seen": 101576245, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 2.48632812, + "step": 4709, + "time_per_iteration": 2.9723527431488037 + }, + { + "auxiliary_loss_clip": 0.01514441, + "auxiliary_loss_mlp": 0.01278972, + "balance_loss_clip": 1.14966249, + "balance_loss_mlp": 1.03235173, + "epoch": 0.5663440149101184, + "flos": 17678833060320.0, + "grad_norm": 1.8687602165595727, + "language_loss": 0.73782372, + "learning_rate": 1.6689103234673086e-06, + "loss": 0.76575786, + "num_input_tokens_seen": 101594565, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 2.46289062, + "step": 4710, + "time_per_iteration": 3.9366090297698975 + }, + { + "auxiliary_loss_clip": 0.01505216, + "auxiliary_loss_mlp": 0.01282744, + "balance_loss_clip": 1.14143181, + "balance_loss_mlp": 1.03478909, + "epoch": 0.5664642578007575, + "flos": 23370617152800.0, + "grad_norm": 1.969145024574061, + "language_loss": 0.76884747, + "learning_rate": 1.668142124997189e-06, + "loss": 0.79672712, + "num_input_tokens_seen": 101614225, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 2.4765625, + "step": 4711, + "time_per_iteration": 3.05781626701355 + }, + { + "auxiliary_loss_clip": 0.01492734, + "auxiliary_loss_mlp": 0.01215073, + "balance_loss_clip": 1.1327467, + "balance_loss_mlp": 1.01289368, + "epoch": 0.5665845006913967, + "flos": 65523560182560.0, + "grad_norm": 0.7522094855011691, + "language_loss": 0.59651804, + "learning_rate": 1.6673739768696453e-06, + "loss": 0.62359607, + "num_input_tokens_seen": 101680795, + "router_z_loss_clip": 3.59375, + "router_z_loss_mlp": 2.0234375, + "step": 4712, + "time_per_iteration": 3.556166648864746 + }, + { + "auxiliary_loss_clip": 0.01516202, + "auxiliary_loss_mlp": 0.01285555, + "balance_loss_clip": 1.15416205, + "balance_loss_mlp": 1.03721809, + "epoch": 0.5667047435820357, + "flos": 26143307121600.0, + "grad_norm": 2.1121077523422525, + "language_loss": 0.7784881, + "learning_rate": 1.6666058792012052e-06, + "loss": 0.80650568, + "num_input_tokens_seen": 101701680, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 2.48046875, + "step": 4713, + "time_per_iteration": 3.082622766494751 + }, + { + "auxiliary_loss_clip": 0.01489224, + "auxiliary_loss_mlp": 0.01199402, + "balance_loss_clip": 1.1296531, + "balance_loss_mlp": 0.99951172, + "epoch": 0.5668249864726748, + "flos": 71874504325440.0, + "grad_norm": 0.9033415161070244, + "language_loss": 0.68711078, + "learning_rate": 1.6658378321083878e-06, + "loss": 0.71399701, + "num_input_tokens_seen": 101766010, + "router_z_loss_clip": 3.59375, + "router_z_loss_mlp": 2.0, + "step": 4714, + "time_per_iteration": 4.329547882080078 + }, + { + "auxiliary_loss_clip": 0.01498953, + "auxiliary_loss_mlp": 0.01277441, + "balance_loss_clip": 1.13452613, + "balance_loss_mlp": 1.02891397, + "epoch": 0.5669452293633139, + "flos": 22197601290240.0, + "grad_norm": 1.8063328845418571, + "language_loss": 0.82572484, + "learning_rate": 1.6650698357077055e-06, + "loss": 0.85348874, + "num_input_tokens_seen": 101783055, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 2.48242188, + "step": 4715, + "time_per_iteration": 3.059903621673584 + }, + { + "auxiliary_loss_clip": 0.01504533, + "auxiliary_loss_mlp": 0.01279063, + "balance_loss_clip": 1.14082313, + "balance_loss_mlp": 1.02977228, + "epoch": 0.567065472253953, + "flos": 18225155741760.0, + "grad_norm": 4.775746847293503, + "language_loss": 0.80737668, + "learning_rate": 1.6643018901156632e-06, + "loss": 0.83521259, + "num_input_tokens_seen": 101802150, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 2.49023438, + "step": 4716, + "time_per_iteration": 3.039760112762451 + }, + { + "auxiliary_loss_clip": 0.01504365, + "auxiliary_loss_mlp": 0.01276772, + "balance_loss_clip": 1.14115119, + "balance_loss_mlp": 1.02881694, + "epoch": 0.567185715144592, + "flos": 20373390290880.0, + "grad_norm": 2.49502314639612, + "language_loss": 0.79606104, + "learning_rate": 1.6635339954487566e-06, + "loss": 0.82387245, + "num_input_tokens_seen": 101818025, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.4765625, + "step": 4717, + "time_per_iteration": 3.052494525909424 + }, + { + "auxiliary_loss_clip": 0.01508204, + "auxiliary_loss_mlp": 0.01279386, + "balance_loss_clip": 1.14529943, + "balance_loss_mlp": 1.03200269, + "epoch": 0.5673059580352312, + "flos": 23223568219200.0, + "grad_norm": 1.8903291344390383, + "language_loss": 0.81895876, + "learning_rate": 1.6627661518234765e-06, + "loss": 0.8468346, + "num_input_tokens_seen": 101837280, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.47070312, + "step": 4718, + "time_per_iteration": 2.9325404167175293 + }, + { + "auxiliary_loss_clip": 0.01505261, + "auxiliary_loss_mlp": 0.01275435, + "balance_loss_clip": 1.14211822, + "balance_loss_mlp": 1.02633524, + "epoch": 0.5674262009258703, + "flos": 21721408505280.0, + "grad_norm": 1.9523986340374302, + "language_loss": 0.85630143, + "learning_rate": 1.661998359356302e-06, + "loss": 0.88410842, + "num_input_tokens_seen": 101856310, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.48828125, + "step": 4719, + "time_per_iteration": 3.0182383060455322 + }, + { + "auxiliary_loss_clip": 0.01485579, + "auxiliary_loss_mlp": 0.01198547, + "balance_loss_clip": 1.12699068, + "balance_loss_mlp": 0.99865723, + "epoch": 0.5675464438165093, + "flos": 67476652526880.0, + "grad_norm": 0.74297109370754, + "language_loss": 0.55635858, + "learning_rate": 1.6612306181637077e-06, + "loss": 0.58319986, + "num_input_tokens_seen": 101915635, + "router_z_loss_clip": 3.59375, + "router_z_loss_mlp": 1.99609375, + "step": 4720, + "time_per_iteration": 3.4725422859191895 + }, + { + "auxiliary_loss_clip": 0.01503674, + "auxiliary_loss_mlp": 0.01284501, + "balance_loss_clip": 1.14046741, + "balance_loss_mlp": 1.03711796, + "epoch": 0.5676666867071485, + "flos": 18881129826720.0, + "grad_norm": 2.4250246542770206, + "language_loss": 0.65861231, + "learning_rate": 1.6604629283621598e-06, + "loss": 0.68649399, + "num_input_tokens_seen": 101933565, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.47070312, + "step": 4721, + "time_per_iteration": 2.9174869060516357 + }, + { + "auxiliary_loss_clip": 0.01503439, + "auxiliary_loss_mlp": 0.01276967, + "balance_loss_clip": 1.13972163, + "balance_loss_mlp": 1.02672315, + "epoch": 0.5677869295977875, + "flos": 33549574953600.0, + "grad_norm": 2.281236237329048, + "language_loss": 0.74673045, + "learning_rate": 1.6596952900681152e-06, + "loss": 0.77453446, + "num_input_tokens_seen": 101954325, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.5, + "step": 4722, + "time_per_iteration": 3.0570483207702637 + }, + { + "auxiliary_loss_clip": 0.01506415, + "auxiliary_loss_mlp": 0.0128808, + "balance_loss_clip": 1.1417613, + "balance_loss_mlp": 1.03726411, + "epoch": 0.5679071724884266, + "flos": 28039734066240.0, + "grad_norm": 2.320947746014876, + "language_loss": 0.8169142, + "learning_rate": 1.658927703398025e-06, + "loss": 0.84485912, + "num_input_tokens_seen": 101974390, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 2.50585938, + "step": 4723, + "time_per_iteration": 3.137611150741577 + }, + { + "auxiliary_loss_clip": 0.01500859, + "auxiliary_loss_mlp": 0.0127792, + "balance_loss_clip": 1.13802862, + "balance_loss_mlp": 1.03091812, + "epoch": 0.5680274153790658, + "flos": 23552370717120.0, + "grad_norm": 2.444917794530609, + "language_loss": 0.7863546, + "learning_rate": 1.6581601684683309e-06, + "loss": 0.81414241, + "num_input_tokens_seen": 101994815, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.46679688, + "step": 4724, + "time_per_iteration": 3.0025744438171387 + }, + { + "auxiliary_loss_clip": 0.0150746, + "auxiliary_loss_mlp": 0.01284599, + "balance_loss_clip": 1.14299583, + "balance_loss_mlp": 1.03568995, + "epoch": 0.5681476582697048, + "flos": 22457335880160.0, + "grad_norm": 5.469725222720438, + "language_loss": 0.69182843, + "learning_rate": 1.6573926853954674e-06, + "loss": 0.71974903, + "num_input_tokens_seen": 102012400, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 2.48632812, + "step": 4725, + "time_per_iteration": 2.95894193649292 + }, + { + "auxiliary_loss_clip": 0.01499355, + "auxiliary_loss_mlp": 0.01266944, + "balance_loss_clip": 1.13678181, + "balance_loss_mlp": 1.02127731, + "epoch": 0.5682679011603439, + "flos": 19538962391520.0, + "grad_norm": 2.032835454897938, + "language_loss": 0.8310023, + "learning_rate": 1.6566252542958608e-06, + "loss": 0.85866529, + "num_input_tokens_seen": 102031900, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.453125, + "step": 4726, + "time_per_iteration": 2.9527995586395264 + }, + { + "auxiliary_loss_clip": 0.01504258, + "auxiliary_loss_mlp": 0.01284874, + "balance_loss_clip": 1.14112186, + "balance_loss_mlp": 1.03577399, + "epoch": 0.568388144050983, + "flos": 28767924096480.0, + "grad_norm": 1.727696670120745, + "language_loss": 0.78571206, + "learning_rate": 1.6558578752859305e-06, + "loss": 0.8136034, + "num_input_tokens_seen": 102050860, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.48828125, + "step": 4727, + "time_per_iteration": 3.025491952896118 + }, + { + "auxiliary_loss_clip": 0.01504586, + "auxiliary_loss_mlp": 0.01271518, + "balance_loss_clip": 1.14059711, + "balance_loss_mlp": 1.02413487, + "epoch": 0.5685083869416221, + "flos": 21211269652800.0, + "grad_norm": 2.058840011034251, + "language_loss": 0.78952515, + "learning_rate": 1.6550905484820865e-06, + "loss": 0.81728619, + "num_input_tokens_seen": 102069320, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 2.47070312, + "step": 4728, + "time_per_iteration": 2.969761610031128 + }, + { + "auxiliary_loss_clip": 0.01500491, + "auxiliary_loss_mlp": 0.01287611, + "balance_loss_clip": 1.13659382, + "balance_loss_mlp": 1.03870225, + "epoch": 0.5686286298322611, + "flos": 24829348759200.0, + "grad_norm": 2.328399241433442, + "language_loss": 0.78886163, + "learning_rate": 1.6543232740007328e-06, + "loss": 0.81674266, + "num_input_tokens_seen": 102086435, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 2.48632812, + "step": 4729, + "time_per_iteration": 2.9749293327331543 + }, + { + "auxiliary_loss_clip": 0.01502225, + "auxiliary_loss_mlp": 0.01290684, + "balance_loss_clip": 1.13852501, + "balance_loss_mlp": 1.04349136, + "epoch": 0.5687488727229003, + "flos": 26617565570400.0, + "grad_norm": 2.773261617653828, + "language_loss": 0.67544574, + "learning_rate": 1.653556051958263e-06, + "loss": 0.7033748, + "num_input_tokens_seen": 102106115, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.46875, + "step": 4730, + "time_per_iteration": 3.083904981613159 + }, + { + "auxiliary_loss_clip": 0.01508423, + "auxiliary_loss_mlp": 0.01279614, + "balance_loss_clip": 1.14554954, + "balance_loss_mlp": 1.03204036, + "epoch": 0.5688691156135394, + "flos": 20810554634880.0, + "grad_norm": 1.9944533676102345, + "language_loss": 0.73804271, + "learning_rate": 1.6527888824710642e-06, + "loss": 0.76592302, + "num_input_tokens_seen": 102125715, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.47265625, + "step": 4731, + "time_per_iteration": 2.971890449523926 + }, + { + "auxiliary_loss_clip": 0.01504695, + "auxiliary_loss_mlp": 0.01285115, + "balance_loss_clip": 1.14184463, + "balance_loss_mlp": 1.03391695, + "epoch": 0.5689893585041784, + "flos": 25883117393760.0, + "grad_norm": 3.3013111444580328, + "language_loss": 0.76751661, + "learning_rate": 1.6520217656555166e-06, + "loss": 0.79541463, + "num_input_tokens_seen": 102145005, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.50976562, + "step": 4732, + "time_per_iteration": 3.065983295440674 + }, + { + "auxiliary_loss_clip": 0.01505807, + "auxiliary_loss_mlp": 0.01271275, + "balance_loss_clip": 1.14352202, + "balance_loss_mlp": 1.026371, + "epoch": 0.5691096013948175, + "flos": 23479737562080.0, + "grad_norm": 2.898974694973574, + "language_loss": 0.70857739, + "learning_rate": 1.65125470162799e-06, + "loss": 0.73634821, + "num_input_tokens_seen": 102165360, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.4453125, + "step": 4733, + "time_per_iteration": 3.8795266151428223 + }, + { + "auxiliary_loss_clip": 0.01503552, + "auxiliary_loss_mlp": 0.01280513, + "balance_loss_clip": 1.14005733, + "balance_loss_mlp": 1.03293931, + "epoch": 0.5692298442854566, + "flos": 18077462029440.0, + "grad_norm": 2.6340342677442887, + "language_loss": 0.69884288, + "learning_rate": 1.6504876905048485e-06, + "loss": 0.72668362, + "num_input_tokens_seen": 102182320, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 2.47265625, + "step": 4734, + "time_per_iteration": 3.852959632873535 + }, + { + "auxiliary_loss_clip": 0.01504714, + "auxiliary_loss_mlp": 0.01273286, + "balance_loss_clip": 1.14317417, + "balance_loss_mlp": 1.02685666, + "epoch": 0.5693500871760957, + "flos": 23041549157760.0, + "grad_norm": 1.6977243624553533, + "language_loss": 0.71953338, + "learning_rate": 1.6497207324024464e-06, + "loss": 0.74731338, + "num_input_tokens_seen": 102201220, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 2.4609375, + "step": 4735, + "time_per_iteration": 2.9678165912628174 + }, + { + "auxiliary_loss_clip": 0.01502582, + "auxiliary_loss_mlp": 0.01275508, + "balance_loss_clip": 1.14001119, + "balance_loss_mlp": 1.02717173, + "epoch": 0.5694703300667348, + "flos": 18991691506080.0, + "grad_norm": 3.043770229234092, + "language_loss": 0.82699388, + "learning_rate": 1.6489538274371305e-06, + "loss": 0.85477477, + "num_input_tokens_seen": 102219825, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.48046875, + "step": 4736, + "time_per_iteration": 2.9626412391662598 + }, + { + "auxiliary_loss_clip": 0.0150935, + "auxiliary_loss_mlp": 0.01276305, + "balance_loss_clip": 1.14729071, + "balance_loss_mlp": 1.03311777, + "epoch": 0.5695905729573739, + "flos": 21910899414240.0, + "grad_norm": 2.218683376447837, + "language_loss": 0.82935011, + "learning_rate": 1.6481869757252396e-06, + "loss": 0.85720664, + "num_input_tokens_seen": 102238160, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.42773438, + "step": 4737, + "time_per_iteration": 3.9634668827056885 + }, + { + "auxiliary_loss_clip": 0.01500722, + "auxiliary_loss_mlp": 0.01286869, + "balance_loss_clip": 1.13764739, + "balance_loss_mlp": 1.04024839, + "epoch": 0.569710815848013, + "flos": 28479249956160.0, + "grad_norm": 2.2437967675195565, + "language_loss": 0.71827102, + "learning_rate": 1.647420177383105e-06, + "loss": 0.74614692, + "num_input_tokens_seen": 102261030, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.46289062, + "step": 4738, + "time_per_iteration": 2.974386215209961 + }, + { + "auxiliary_loss_clip": 0.01510199, + "auxiliary_loss_mlp": 0.0127382, + "balance_loss_clip": 1.14819145, + "balance_loss_mlp": 1.02796245, + "epoch": 0.569831058738652, + "flos": 28368688276800.0, + "grad_norm": 2.149391891430835, + "language_loss": 0.7282263, + "learning_rate": 1.646653432527049e-06, + "loss": 0.7560665, + "num_input_tokens_seen": 102281670, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.45507812, + "step": 4739, + "time_per_iteration": 3.029325485229492 + }, + { + "auxiliary_loss_clip": 0.01501966, + "auxiliary_loss_mlp": 0.0127402, + "balance_loss_clip": 1.13973713, + "balance_loss_mlp": 1.02759099, + "epoch": 0.5699513016292912, + "flos": 25852433148000.0, + "grad_norm": 2.0173808182317408, + "language_loss": 0.74871236, + "learning_rate": 1.645886741273387e-06, + "loss": 0.77647221, + "num_input_tokens_seen": 102303485, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.4609375, + "step": 4740, + "time_per_iteration": 2.998666286468506 + }, + { + "auxiliary_loss_clip": 0.01507441, + "auxiliary_loss_mlp": 0.01283429, + "balance_loss_clip": 1.14265716, + "balance_loss_mlp": 1.03661799, + "epoch": 0.5700715445199303, + "flos": 18039647289600.0, + "grad_norm": 2.3568868718987126, + "language_loss": 0.73842478, + "learning_rate": 1.645120103738424e-06, + "loss": 0.76633352, + "num_input_tokens_seen": 102320995, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 2.46484375, + "step": 4741, + "time_per_iteration": 2.9396471977233887 + }, + { + "auxiliary_loss_clip": 0.0150947, + "auxiliary_loss_mlp": 0.01277609, + "balance_loss_clip": 1.14617825, + "balance_loss_mlp": 1.03308678, + "epoch": 0.5701917874105693, + "flos": 11474937851040.0, + "grad_norm": 3.7801494130946667, + "language_loss": 0.83871186, + "learning_rate": 1.6443535200384591e-06, + "loss": 0.86658263, + "num_input_tokens_seen": 102339170, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.44140625, + "step": 4742, + "time_per_iteration": 3.761573076248169 + }, + { + "auxiliary_loss_clip": 0.01502117, + "auxiliary_loss_mlp": 0.01277739, + "balance_loss_clip": 1.1376704, + "balance_loss_mlp": 1.03207207, + "epoch": 0.5703120303012085, + "flos": 21763888408800.0, + "grad_norm": 2.9276229249193046, + "language_loss": 0.70626545, + "learning_rate": 1.6435869902897827e-06, + "loss": 0.73406404, + "num_input_tokens_seen": 102357750, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 2.453125, + "step": 4743, + "time_per_iteration": 3.0526440143585205 + }, + { + "auxiliary_loss_clip": 0.01488651, + "auxiliary_loss_mlp": 0.01198738, + "balance_loss_clip": 1.12907302, + "balance_loss_mlp": 0.99503326, + "epoch": 0.5704322731918475, + "flos": 56752092679680.0, + "grad_norm": 0.8040621030406016, + "language_loss": 0.61873341, + "learning_rate": 1.6428205146086764e-06, + "loss": 0.64560735, + "num_input_tokens_seen": 102419730, + "router_z_loss_clip": 3.59375, + "router_z_loss_mlp": 2.0390625, + "step": 4744, + "time_per_iteration": 3.5326154232025146 + }, + { + "auxiliary_loss_clip": 0.01503301, + "auxiliary_loss_mlp": 0.01271576, + "balance_loss_clip": 1.13964009, + "balance_loss_mlp": 1.02514696, + "epoch": 0.5705525160824866, + "flos": 20743269350400.0, + "grad_norm": 1.6182499249003532, + "language_loss": 0.70556593, + "learning_rate": 1.6420540931114142e-06, + "loss": 0.73331475, + "num_input_tokens_seen": 102440320, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 2.4609375, + "step": 4745, + "time_per_iteration": 3.109114646911621 + }, + { + "auxiliary_loss_clip": 0.01506298, + "auxiliary_loss_mlp": 0.01281457, + "balance_loss_clip": 1.14227355, + "balance_loss_mlp": 1.03540874, + "epoch": 0.5706727589731257, + "flos": 18773298974880.0, + "grad_norm": 2.232656728037454, + "language_loss": 0.79039717, + "learning_rate": 1.6412877259142616e-06, + "loss": 0.81827468, + "num_input_tokens_seen": 102460240, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.45703125, + "step": 4746, + "time_per_iteration": 3.0515987873077393 + }, + { + "auxiliary_loss_clip": 0.01509278, + "auxiliary_loss_mlp": 0.01278942, + "balance_loss_clip": 1.14835405, + "balance_loss_mlp": 1.03384733, + "epoch": 0.5707930018637648, + "flos": 27637122640320.0, + "grad_norm": 2.2171418312265074, + "language_loss": 0.74153364, + "learning_rate": 1.6405214131334757e-06, + "loss": 0.76941586, + "num_input_tokens_seen": 102478765, + "router_z_loss_clip": 3.609375, + "router_z_loss_mlp": 2.44726562, + "step": 4747, + "time_per_iteration": 3.053771495819092 + }, + { + "auxiliary_loss_clip": 0.01506428, + "auxiliary_loss_mlp": 0.01284087, + "balance_loss_clip": 1.14423883, + "balance_loss_mlp": 1.03842092, + "epoch": 0.5709132447544039, + "flos": 27600028535520.0, + "grad_norm": 1.8920272775976859, + "language_loss": 0.79763114, + "learning_rate": 1.6397551548853052e-06, + "loss": 0.82553625, + "num_input_tokens_seen": 102496930, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.453125, + "step": 4748, + "time_per_iteration": 3.142883539199829 + }, + { + "auxiliary_loss_clip": 0.0150897, + "auxiliary_loss_mlp": 0.01283959, + "balance_loss_clip": 1.14464331, + "balance_loss_mlp": 1.03600395, + "epoch": 0.571033487645043, + "flos": 21688069288320.0, + "grad_norm": 2.0759287448372192, + "language_loss": 0.71053565, + "learning_rate": 1.6389889512859917e-06, + "loss": 0.73846495, + "num_input_tokens_seen": 102516590, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 2.4765625, + "step": 4749, + "time_per_iteration": 3.059070110321045 + }, + { + "auxiliary_loss_clip": 0.01486043, + "auxiliary_loss_mlp": 0.01206573, + "balance_loss_clip": 1.12707257, + "balance_loss_mlp": 1.00668335, + "epoch": 0.5711537305356821, + "flos": 70188504998400.0, + "grad_norm": 0.8155560031727704, + "language_loss": 0.60398865, + "learning_rate": 1.638222802451767e-06, + "loss": 0.63091481, + "num_input_tokens_seen": 102578070, + "router_z_loss_clip": 3.59375, + "router_z_loss_mlp": 2.0, + "step": 4750, + "time_per_iteration": 3.530555009841919 + }, + { + "auxiliary_loss_clip": 0.01506001, + "auxiliary_loss_mlp": 0.01269853, + "balance_loss_clip": 1.14352691, + "balance_loss_mlp": 1.02971768, + "epoch": 0.5712739734263211, + "flos": 24719811140160.0, + "grad_norm": 1.698392232811627, + "language_loss": 0.75474876, + "learning_rate": 1.6374567084988561e-06, + "loss": 0.7825073, + "num_input_tokens_seen": 102599255, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.39648438, + "step": 4751, + "time_per_iteration": 3.061141014099121 + }, + { + "auxiliary_loss_clip": 0.01513398, + "auxiliary_loss_mlp": 0.01279629, + "balance_loss_clip": 1.15023673, + "balance_loss_mlp": 1.03434432, + "epoch": 0.5713942163169603, + "flos": 26580585250080.0, + "grad_norm": 2.0820346250633284, + "language_loss": 0.76812661, + "learning_rate": 1.6366906695434738e-06, + "loss": 0.79605687, + "num_input_tokens_seen": 102621775, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.44921875, + "step": 4752, + "time_per_iteration": 3.1583008766174316 + }, + { + "auxiliary_loss_clip": 0.01511071, + "auxiliary_loss_mlp": 0.01278857, + "balance_loss_clip": 1.15021646, + "balance_loss_mlp": 1.03280878, + "epoch": 0.5715144592075994, + "flos": 21144591218880.0, + "grad_norm": 2.7143312135520135, + "language_loss": 0.86216599, + "learning_rate": 1.6359246857018275e-06, + "loss": 0.89006525, + "num_input_tokens_seen": 102639305, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 2.45703125, + "step": 4753, + "time_per_iteration": 3.046269178390503 + }, + { + "auxiliary_loss_clip": 0.01507294, + "auxiliary_loss_mlp": 0.0128838, + "balance_loss_clip": 1.14448619, + "balance_loss_mlp": 1.04309511, + "epoch": 0.5716347020982384, + "flos": 23332461059520.0, + "grad_norm": 2.020860196207286, + "language_loss": 0.78168857, + "learning_rate": 1.6351587570901178e-06, + "loss": 0.8096453, + "num_input_tokens_seen": 102659430, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.44921875, + "step": 4754, + "time_per_iteration": 2.983649492263794 + }, + { + "auxiliary_loss_clip": 0.01510841, + "auxiliary_loss_mlp": 0.01286129, + "balance_loss_clip": 1.14755654, + "balance_loss_mlp": 1.04103434, + "epoch": 0.5717549449888776, + "flos": 17010911604960.0, + "grad_norm": 3.4510997289856817, + "language_loss": 0.75740826, + "learning_rate": 1.634392883824534e-06, + "loss": 0.78537786, + "num_input_tokens_seen": 102671430, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.44726562, + "step": 4755, + "time_per_iteration": 2.9489235877990723 + }, + { + "auxiliary_loss_clip": 0.01510372, + "auxiliary_loss_mlp": 0.01280824, + "balance_loss_clip": 1.14603162, + "balance_loss_mlp": 1.03344083, + "epoch": 0.5718751878795166, + "flos": 35521100383680.0, + "grad_norm": 2.1085807821403493, + "language_loss": 0.67908263, + "learning_rate": 1.6336270660212595e-06, + "loss": 0.70699453, + "num_input_tokens_seen": 102693025, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 2.47070312, + "step": 4756, + "time_per_iteration": 3.0374159812927246 + }, + { + "auxiliary_loss_clip": 0.01510718, + "auxiliary_loss_mlp": 0.01295118, + "balance_loss_clip": 1.14758205, + "balance_loss_mlp": 1.04659033, + "epoch": 0.5719954307701557, + "flos": 38616827770080.0, + "grad_norm": 2.2689405824783893, + "language_loss": 0.65568244, + "learning_rate": 1.6328613037964676e-06, + "loss": 0.68374085, + "num_input_tokens_seen": 102716090, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.48242188, + "step": 4757, + "time_per_iteration": 3.137159585952759 + }, + { + "auxiliary_loss_clip": 0.01505013, + "auxiliary_loss_mlp": 0.01278101, + "balance_loss_clip": 1.14146543, + "balance_loss_mlp": 1.0339601, + "epoch": 0.5721156736607949, + "flos": 20633200737120.0, + "grad_norm": 1.815218049917593, + "language_loss": 0.67877179, + "learning_rate": 1.6320955972663241e-06, + "loss": 0.70660293, + "num_input_tokens_seen": 102735685, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.4375, + "step": 4758, + "time_per_iteration": 3.03171706199646 + }, + { + "auxiliary_loss_clip": 0.01509003, + "auxiliary_loss_mlp": 0.01284005, + "balance_loss_clip": 1.14512348, + "balance_loss_mlp": 1.03662145, + "epoch": 0.5722359165514339, + "flos": 37418134178880.0, + "grad_norm": 2.2744113859782114, + "language_loss": 0.65799785, + "learning_rate": 1.6313299465469857e-06, + "loss": 0.68592793, + "num_input_tokens_seen": 102758415, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.47070312, + "step": 4759, + "time_per_iteration": 3.148474931716919 + }, + { + "auxiliary_loss_clip": 0.01513491, + "auxiliary_loss_mlp": 0.01280385, + "balance_loss_clip": 1.15154672, + "balance_loss_mlp": 1.03357458, + "epoch": 0.572356159442073, + "flos": 21974581523520.0, + "grad_norm": 3.2266632560024364, + "language_loss": 0.79440427, + "learning_rate": 1.6305643517546014e-06, + "loss": 0.82234305, + "num_input_tokens_seen": 102773795, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.46484375, + "step": 4760, + "time_per_iteration": 2.971527099609375 + }, + { + "auxiliary_loss_clip": 0.0151215, + "auxiliary_loss_mlp": 0.01276925, + "balance_loss_clip": 1.15017545, + "balance_loss_mlp": 1.03316569, + "epoch": 0.5724764023327121, + "flos": 19137564666720.0, + "grad_norm": 1.92973128962648, + "language_loss": 0.84829217, + "learning_rate": 1.629798813005311e-06, + "loss": 0.87618291, + "num_input_tokens_seen": 102793515, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.43359375, + "step": 4761, + "time_per_iteration": 4.693648338317871 + }, + { + "auxiliary_loss_clip": 0.01514953, + "auxiliary_loss_mlp": 0.01270985, + "balance_loss_clip": 1.15295267, + "balance_loss_mlp": 1.02398348, + "epoch": 0.5725966452233512, + "flos": 22821942925440.0, + "grad_norm": 2.0806705485565535, + "language_loss": 0.7109074, + "learning_rate": 1.6290333304152473e-06, + "loss": 0.73876673, + "num_input_tokens_seen": 102813390, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.46679688, + "step": 4762, + "time_per_iteration": 2.916224241256714 + }, + { + "auxiliary_loss_clip": 0.01516061, + "auxiliary_loss_mlp": 0.01277357, + "balance_loss_clip": 1.1540513, + "balance_loss_mlp": 1.0322628, + "epoch": 0.5727168881139902, + "flos": 41499624280320.0, + "grad_norm": 1.8862333406572849, + "language_loss": 0.56699175, + "learning_rate": 1.6282679041005314e-06, + "loss": 0.59492594, + "num_input_tokens_seen": 102838980, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.44726562, + "step": 4763, + "time_per_iteration": 3.1683828830718994 + }, + { + "auxiliary_loss_clip": 0.01511903, + "auxiliary_loss_mlp": 0.01285473, + "balance_loss_clip": 1.15074837, + "balance_loss_mlp": 1.03980601, + "epoch": 0.5728371310046293, + "flos": 14649442754400.0, + "grad_norm": 2.2110135491679848, + "language_loss": 0.87252617, + "learning_rate": 1.6275025341772789e-06, + "loss": 0.90049988, + "num_input_tokens_seen": 102855285, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 2.453125, + "step": 4764, + "time_per_iteration": 3.9052622318267822 + }, + { + "auxiliary_loss_clip": 0.01516849, + "auxiliary_loss_mlp": 0.01284705, + "balance_loss_clip": 1.15465546, + "balance_loss_mlp": 1.03827596, + "epoch": 0.5729573738952685, + "flos": 21508743126240.0, + "grad_norm": 2.16889242178884, + "language_loss": 0.81715703, + "learning_rate": 1.626737220761596e-06, + "loss": 0.84517258, + "num_input_tokens_seen": 102872750, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 2.4609375, + "step": 4765, + "time_per_iteration": 2.9934966564178467 + }, + { + "auxiliary_loss_clip": 0.01518526, + "auxiliary_loss_mlp": 0.01287513, + "balance_loss_clip": 1.15728164, + "balance_loss_mlp": 1.04375422, + "epoch": 0.5730776167859075, + "flos": 23623866027360.0, + "grad_norm": 1.9332518758995918, + "language_loss": 0.79045027, + "learning_rate": 1.62597196396958e-06, + "loss": 0.81851065, + "num_input_tokens_seen": 102890920, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 2.43359375, + "step": 4766, + "time_per_iteration": 3.0628292560577393 + }, + { + "auxiliary_loss_clip": 0.01512652, + "auxiliary_loss_mlp": 0.0128497, + "balance_loss_clip": 1.15126467, + "balance_loss_mlp": 1.03834975, + "epoch": 0.5731978596765466, + "flos": 25741605971520.0, + "grad_norm": 2.179060572424277, + "language_loss": 0.85722351, + "learning_rate": 1.6252067639173197e-06, + "loss": 0.88519979, + "num_input_tokens_seen": 102912830, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 2.46289062, + "step": 4767, + "time_per_iteration": 3.082632541656494 + }, + { + "auxiliary_loss_clip": 0.01510543, + "auxiliary_loss_mlp": 0.01279224, + "balance_loss_clip": 1.14763677, + "balance_loss_mlp": 1.03432083, + "epoch": 0.5733181025671857, + "flos": 26361927221760.0, + "grad_norm": 1.8283449526937483, + "language_loss": 0.69944203, + "learning_rate": 1.6244416207208956e-06, + "loss": 0.72733974, + "num_input_tokens_seen": 102933765, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.4453125, + "step": 4768, + "time_per_iteration": 3.0939671993255615 + }, + { + "auxiliary_loss_clip": 0.01519232, + "auxiliary_loss_mlp": 0.01278475, + "balance_loss_clip": 1.15807354, + "balance_loss_mlp": 1.03433418, + "epoch": 0.5734383454578248, + "flos": 29426439368160.0, + "grad_norm": 3.0809631788845073, + "language_loss": 0.73883766, + "learning_rate": 1.6236765344963787e-06, + "loss": 0.76681471, + "num_input_tokens_seen": 102955025, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 2.4375, + "step": 4769, + "time_per_iteration": 3.096672773361206 + }, + { + "auxiliary_loss_clip": 0.01508147, + "auxiliary_loss_mlp": 0.01272141, + "balance_loss_clip": 1.14668465, + "balance_loss_mlp": 1.0262835, + "epoch": 0.5735585883484638, + "flos": 34972843366080.0, + "grad_norm": 4.511490046559759, + "language_loss": 0.69260818, + "learning_rate": 1.6229115053598322e-06, + "loss": 0.72041106, + "num_input_tokens_seen": 102976780, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 2.45507812, + "step": 4770, + "time_per_iteration": 3.827789306640625 + }, + { + "auxiliary_loss_clip": 0.01523596, + "auxiliary_loss_mlp": 0.01285077, + "balance_loss_clip": 1.16163039, + "balance_loss_mlp": 1.03883862, + "epoch": 0.573678831239103, + "flos": 18772767980640.0, + "grad_norm": 2.739780471534465, + "language_loss": 0.72304761, + "learning_rate": 1.6221465334273108e-06, + "loss": 0.7511344, + "num_input_tokens_seen": 102995990, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.45898438, + "step": 4771, + "time_per_iteration": 2.978501081466675 + }, + { + "auxiliary_loss_clip": 0.0151149, + "auxiliary_loss_mlp": 0.01279512, + "balance_loss_clip": 1.14903736, + "balance_loss_mlp": 1.0349896, + "epoch": 0.5737990741297421, + "flos": 25705270429920.0, + "grad_norm": 2.3208466098143457, + "language_loss": 0.61748302, + "learning_rate": 1.6213816188148593e-06, + "loss": 0.64539301, + "num_input_tokens_seen": 103014695, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.44140625, + "step": 4772, + "time_per_iteration": 3.046764373779297 + }, + { + "auxiliary_loss_clip": 0.01518315, + "auxiliary_loss_mlp": 0.0127418, + "balance_loss_clip": 1.1562773, + "balance_loss_mlp": 1.03290045, + "epoch": 0.5739193170203811, + "flos": 27271567391040.0, + "grad_norm": 2.0847601200377954, + "language_loss": 0.7714237, + "learning_rate": 1.6206167616385162e-06, + "loss": 0.79934859, + "num_input_tokens_seen": 103035760, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.40820312, + "step": 4773, + "time_per_iteration": 2.989631175994873 + }, + { + "auxiliary_loss_clip": 0.01520402, + "auxiliary_loss_mlp": 0.01292042, + "balance_loss_clip": 1.15762234, + "balance_loss_mlp": 1.0454216, + "epoch": 0.5740395599110203, + "flos": 12241359830880.0, + "grad_norm": 2.007346648448119, + "language_loss": 0.73743212, + "learning_rate": 1.6198519620143078e-06, + "loss": 0.76555657, + "num_input_tokens_seen": 103052915, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.46289062, + "step": 4774, + "time_per_iteration": 3.045499563217163 + }, + { + "auxiliary_loss_clip": 0.01518345, + "auxiliary_loss_mlp": 0.01289109, + "balance_loss_clip": 1.15676868, + "balance_loss_mlp": 1.04287004, + "epoch": 0.5741598028016593, + "flos": 25923776745600.0, + "grad_norm": 1.723064944953185, + "language_loss": 0.77993989, + "learning_rate": 1.6190872200582546e-06, + "loss": 0.80801445, + "num_input_tokens_seen": 103074655, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 2.45898438, + "step": 4775, + "time_per_iteration": 3.013631582260132 + }, + { + "auxiliary_loss_clip": 0.01528127, + "auxiliary_loss_mlp": 0.01288857, + "balance_loss_clip": 1.16881764, + "balance_loss_mlp": 1.04299927, + "epoch": 0.5742800456922984, + "flos": 19246116153600.0, + "grad_norm": 13.023824767426536, + "language_loss": 0.78199506, + "learning_rate": 1.6183225358863676e-06, + "loss": 0.81016493, + "num_input_tokens_seen": 103091550, + "router_z_loss_clip": 3.58984375, + "router_z_loss_mlp": 2.45507812, + "step": 4776, + "time_per_iteration": 2.9351789951324463 + }, + { + "auxiliary_loss_clip": 0.01517737, + "auxiliary_loss_mlp": 0.01287024, + "balance_loss_clip": 1.15653276, + "balance_loss_mlp": 1.0404036, + "epoch": 0.5744002885829376, + "flos": 30923175355200.0, + "grad_norm": 2.820854379464381, + "language_loss": 0.72225249, + "learning_rate": 1.617557909614648e-06, + "loss": 0.75030005, + "num_input_tokens_seen": 103110985, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 2.46289062, + "step": 4777, + "time_per_iteration": 3.044795513153076 + }, + { + "auxiliary_loss_clip": 0.01514539, + "auxiliary_loss_mlp": 0.01278672, + "balance_loss_clip": 1.15282845, + "balance_loss_mlp": 1.03395915, + "epoch": 0.5745205314735766, + "flos": 23842182702240.0, + "grad_norm": 1.837623064260858, + "language_loss": 0.86025274, + "learning_rate": 1.6167933413590899e-06, + "loss": 0.88818479, + "num_input_tokens_seen": 103129890, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 2.44335938, + "step": 4778, + "time_per_iteration": 3.0635809898376465 + }, + { + "auxiliary_loss_clip": 0.01521015, + "auxiliary_loss_mlp": 0.01284247, + "balance_loss_clip": 1.15926504, + "balance_loss_mlp": 1.04029727, + "epoch": 0.5746407743642157, + "flos": 12313499919840.0, + "grad_norm": 2.273475577984303, + "language_loss": 0.9053992, + "learning_rate": 1.6160288312356773e-06, + "loss": 0.93345183, + "num_input_tokens_seen": 103147020, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.43554688, + "step": 4779, + "time_per_iteration": 2.988644599914551 + }, + { + "auxiliary_loss_clip": 0.01518358, + "auxiliary_loss_mlp": 0.012823, + "balance_loss_clip": 1.15725124, + "balance_loss_mlp": 1.03548932, + "epoch": 0.5747610172548548, + "flos": 24135939216000.0, + "grad_norm": 1.8298435318993105, + "language_loss": 0.81686831, + "learning_rate": 1.6152643793603857e-06, + "loss": 0.84487486, + "num_input_tokens_seen": 103167370, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 2.46484375, + "step": 4780, + "time_per_iteration": 3.0614407062530518 + }, + { + "auxiliary_loss_clip": 0.01520844, + "auxiliary_loss_mlp": 0.01280727, + "balance_loss_clip": 1.16029525, + "balance_loss_mlp": 1.03448868, + "epoch": 0.5748812601454939, + "flos": 25410679496640.0, + "grad_norm": 1.9622367521830246, + "language_loss": 0.87894845, + "learning_rate": 1.6144999858491815e-06, + "loss": 0.90696418, + "num_input_tokens_seen": 103186000, + "router_z_loss_clip": 3.60351562, + "router_z_loss_mlp": 2.45898438, + "step": 4781, + "time_per_iteration": 2.980112314224243 + }, + { + "auxiliary_loss_clip": 0.01514125, + "auxiliary_loss_mlp": 0.01278207, + "balance_loss_clip": 1.15110707, + "balance_loss_mlp": 1.02986979, + "epoch": 0.575001503036133, + "flos": 30627408648960.0, + "grad_norm": 1.7097596566815176, + "language_loss": 0.85986948, + "learning_rate": 1.6137356508180232e-06, + "loss": 0.88779283, + "num_input_tokens_seen": 103207710, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 2.48046875, + "step": 4782, + "time_per_iteration": 3.0762405395507812 + }, + { + "auxiliary_loss_clip": 0.0151736, + "auxiliary_loss_mlp": 0.01287878, + "balance_loss_clip": 1.15679526, + "balance_loss_mlp": 1.0416398, + "epoch": 0.5751217459267721, + "flos": 21728766568320.0, + "grad_norm": 1.9958130708207293, + "language_loss": 0.81518382, + "learning_rate": 1.6129713743828593e-06, + "loss": 0.84323621, + "num_input_tokens_seen": 103226720, + "router_z_loss_clip": 3.60742188, + "router_z_loss_mlp": 2.45898438, + "step": 4783, + "time_per_iteration": 2.9940178394317627 + }, + { + "auxiliary_loss_clip": 0.01517894, + "auxiliary_loss_mlp": 0.01273259, + "balance_loss_clip": 1.15679729, + "balance_loss_mlp": 1.02930951, + "epoch": 0.5752419888174112, + "flos": 21653706011040.0, + "grad_norm": 1.5463757924254906, + "language_loss": 0.75362873, + "learning_rate": 1.6122071566596306e-06, + "loss": 0.78154022, + "num_input_tokens_seen": 103246995, + "router_z_loss_clip": 3.609375, + "router_z_loss_mlp": 2.43554688, + "step": 4784, + "time_per_iteration": 3.0223424434661865 + }, + { + "auxiliary_loss_clip": 0.01518151, + "auxiliary_loss_mlp": 0.01269822, + "balance_loss_clip": 1.15729797, + "balance_loss_mlp": 1.02587247, + "epoch": 0.5753622317080502, + "flos": 17777409441120.0, + "grad_norm": 2.310965044369572, + "language_loss": 0.83596933, + "learning_rate": 1.6114429977642674e-06, + "loss": 0.86384904, + "num_input_tokens_seen": 103261500, + "router_z_loss_clip": 3.609375, + "router_z_loss_mlp": 2.43554688, + "step": 4785, + "time_per_iteration": 3.1801748275756836 + }, + { + "auxiliary_loss_clip": 0.0152605, + "auxiliary_loss_mlp": 0.01287258, + "balance_loss_clip": 1.16451204, + "balance_loss_mlp": 1.04178238, + "epoch": 0.5754824745986894, + "flos": 19791528559200.0, + "grad_norm": 1.7548790089130522, + "language_loss": 0.73990488, + "learning_rate": 1.6106788978126926e-06, + "loss": 0.76803792, + "num_input_tokens_seen": 103280475, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 2.45117188, + "step": 4786, + "time_per_iteration": 3.0407614707946777 + }, + { + "auxiliary_loss_clip": 0.01516148, + "auxiliary_loss_mlp": 0.01276691, + "balance_loss_clip": 1.15467048, + "balance_loss_mlp": 1.03083348, + "epoch": 0.5756027174893285, + "flos": 30987919452960.0, + "grad_norm": 2.3531982081065737, + "language_loss": 0.78941399, + "learning_rate": 1.6099148569208196e-06, + "loss": 0.8173424, + "num_input_tokens_seen": 103297695, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 2.45507812, + "step": 4787, + "time_per_iteration": 3.0884110927581787 + }, + { + "auxiliary_loss_clip": 0.01522798, + "auxiliary_loss_mlp": 0.01281427, + "balance_loss_clip": 1.16254735, + "balance_loss_mlp": 1.03442574, + "epoch": 0.5757229603799675, + "flos": 28549303996320.0, + "grad_norm": 2.026758457680145, + "language_loss": 0.63587201, + "learning_rate": 1.6091508752045523e-06, + "loss": 0.66391426, + "num_input_tokens_seen": 103318575, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 2.46679688, + "step": 4788, + "time_per_iteration": 3.9933154582977295 + }, + { + "auxiliary_loss_clip": 0.01515109, + "auxiliary_loss_mlp": 0.01273698, + "balance_loss_clip": 1.15412939, + "balance_loss_mlp": 1.02803123, + "epoch": 0.5758432032706067, + "flos": 23001041518560.0, + "grad_norm": 1.681894404297685, + "language_loss": 0.86627889, + "learning_rate": 1.608386952779787e-06, + "loss": 0.89416695, + "num_input_tokens_seen": 103337945, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 2.453125, + "step": 4789, + "time_per_iteration": 3.907787561416626 + }, + { + "auxiliary_loss_clip": 0.01524201, + "auxiliary_loss_mlp": 0.0128154, + "balance_loss_clip": 1.16241014, + "balance_loss_mlp": 1.03797197, + "epoch": 0.5759634461612457, + "flos": 25742136965760.0, + "grad_norm": 1.6742030061616469, + "language_loss": 0.74696296, + "learning_rate": 1.6076230897624098e-06, + "loss": 0.77502036, + "num_input_tokens_seen": 103360150, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.43164062, + "step": 4790, + "time_per_iteration": 3.1646981239318848 + }, + { + "auxiliary_loss_clip": 0.01520628, + "auxiliary_loss_mlp": 0.01285438, + "balance_loss_clip": 1.15816987, + "balance_loss_mlp": 1.04148793, + "epoch": 0.5760836890518848, + "flos": 30594410785440.0, + "grad_norm": 2.6992390423399715, + "language_loss": 0.77378803, + "learning_rate": 1.6068592862682974e-06, + "loss": 0.80184871, + "num_input_tokens_seen": 103378305, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.43554688, + "step": 4791, + "time_per_iteration": 3.902940034866333 + }, + { + "auxiliary_loss_clip": 0.01521784, + "auxiliary_loss_mlp": 0.01284579, + "balance_loss_clip": 1.16096663, + "balance_loss_mlp": 1.03853071, + "epoch": 0.576203931942524, + "flos": 36541947011040.0, + "grad_norm": 1.960308536775931, + "language_loss": 0.74129212, + "learning_rate": 1.6060955424133187e-06, + "loss": 0.76935577, + "num_input_tokens_seen": 103399230, + "router_z_loss_clip": 3.60742188, + "router_z_loss_mlp": 2.45703125, + "step": 4792, + "time_per_iteration": 3.1372904777526855 + }, + { + "auxiliary_loss_clip": 0.01519675, + "auxiliary_loss_mlp": 0.01274266, + "balance_loss_clip": 1.1593399, + "balance_loss_mlp": 1.02821803, + "epoch": 0.576324174833163, + "flos": 25519117199040.0, + "grad_norm": 2.4398688658127607, + "language_loss": 0.89146256, + "learning_rate": 1.6053318583133332e-06, + "loss": 0.919402, + "num_input_tokens_seen": 103420100, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 2.45703125, + "step": 4793, + "time_per_iteration": 2.9807205200195312 + }, + { + "auxiliary_loss_clip": 0.01520424, + "auxiliary_loss_mlp": 0.012858, + "balance_loss_clip": 1.15875149, + "balance_loss_mlp": 1.03860807, + "epoch": 0.5764444177238021, + "flos": 25121626074720.0, + "grad_norm": 2.379595851304533, + "language_loss": 0.74869317, + "learning_rate": 1.6045682340841907e-06, + "loss": 0.77675539, + "num_input_tokens_seen": 103439025, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.46875, + "step": 4794, + "time_per_iteration": 3.0084235668182373 + }, + { + "auxiliary_loss_clip": 0.01525242, + "auxiliary_loss_mlp": 0.01204384, + "balance_loss_clip": 1.17075801, + "balance_loss_mlp": 1.00525665, + "epoch": 0.5765646606144411, + "flos": 62218467531360.0, + "grad_norm": 0.7645015865054188, + "language_loss": 0.58007669, + "learning_rate": 1.6038046698417336e-06, + "loss": 0.60737288, + "num_input_tokens_seen": 103499920, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 1.98828125, + "step": 4795, + "time_per_iteration": 3.4464495182037354 + }, + { + "auxiliary_loss_clip": 0.01519623, + "auxiliary_loss_mlp": 0.01276638, + "balance_loss_clip": 1.15763164, + "balance_loss_mlp": 1.03306949, + "epoch": 0.5766849035050803, + "flos": 25121095080480.0, + "grad_norm": 2.084312334645415, + "language_loss": 0.69451094, + "learning_rate": 1.6030411657017919e-06, + "loss": 0.7224735, + "num_input_tokens_seen": 103519575, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.43164062, + "step": 4796, + "time_per_iteration": 2.967688798904419 + }, + { + "auxiliary_loss_clip": 0.01520411, + "auxiliary_loss_mlp": 0.01276793, + "balance_loss_clip": 1.15970349, + "balance_loss_mlp": 1.0353229, + "epoch": 0.5768051463957193, + "flos": 15992568236160.0, + "grad_norm": 1.8251311416693419, + "language_loss": 0.84521216, + "learning_rate": 1.6022777217801903e-06, + "loss": 0.8731842, + "num_input_tokens_seen": 103536530, + "router_z_loss_clip": 3.60742188, + "router_z_loss_mlp": 2.41015625, + "step": 4797, + "time_per_iteration": 3.787968873977661 + }, + { + "auxiliary_loss_clip": 0.01514718, + "auxiliary_loss_mlp": 0.01283502, + "balance_loss_clip": 1.15232742, + "balance_loss_mlp": 1.03554654, + "epoch": 0.5769253892863584, + "flos": 22165817127840.0, + "grad_norm": 2.1536398761909292, + "language_loss": 0.73935616, + "learning_rate": 1.601514338192742e-06, + "loss": 0.7673384, + "num_input_tokens_seen": 103556460, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.4765625, + "step": 4798, + "time_per_iteration": 2.956238031387329 + }, + { + "auxiliary_loss_clip": 0.01513045, + "auxiliary_loss_mlp": 0.01272679, + "balance_loss_clip": 1.15089953, + "balance_loss_mlp": 1.02987337, + "epoch": 0.5770456321769976, + "flos": 22858581892320.0, + "grad_norm": 2.59599289308069, + "language_loss": 0.71693009, + "learning_rate": 1.6007510150552514e-06, + "loss": 0.74478728, + "num_input_tokens_seen": 103574520, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 2.42382812, + "step": 4799, + "time_per_iteration": 3.02624249458313 + }, + { + "auxiliary_loss_clip": 0.01519899, + "auxiliary_loss_mlp": 0.01286923, + "balance_loss_clip": 1.15755177, + "balance_loss_mlp": 1.03973055, + "epoch": 0.5771658750676366, + "flos": 46357208042400.0, + "grad_norm": 2.7931847919446087, + "language_loss": 0.62292731, + "learning_rate": 1.599987752483515e-06, + "loss": 0.65099549, + "num_input_tokens_seen": 103598965, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.46875, + "step": 4800, + "time_per_iteration": 3.159646511077881 + }, + { + "auxiliary_loss_clip": 0.01522768, + "auxiliary_loss_mlp": 0.0127577, + "balance_loss_clip": 1.16227221, + "balance_loss_mlp": 1.02972186, + "epoch": 0.5772861179582757, + "flos": 22161948455520.0, + "grad_norm": 1.6728818626840842, + "language_loss": 0.67784047, + "learning_rate": 1.5992245505933184e-06, + "loss": 0.70582581, + "num_input_tokens_seen": 103618665, + "router_z_loss_clip": 3.60742188, + "router_z_loss_mlp": 2.45703125, + "step": 4801, + "time_per_iteration": 3.123483896255493 + }, + { + "auxiliary_loss_clip": 0.01521341, + "auxiliary_loss_mlp": 0.01272686, + "balance_loss_clip": 1.16009486, + "balance_loss_mlp": 1.02835417, + "epoch": 0.5774063608489148, + "flos": 31251636499680.0, + "grad_norm": 3.3189624150503634, + "language_loss": 0.70868778, + "learning_rate": 1.5984614095004388e-06, + "loss": 0.73662806, + "num_input_tokens_seen": 103639800, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 2.43945312, + "step": 4802, + "time_per_iteration": 3.1655123233795166 + }, + { + "auxiliary_loss_clip": 0.01525389, + "auxiliary_loss_mlp": 0.01282084, + "balance_loss_clip": 1.16508055, + "balance_loss_mlp": 1.03622711, + "epoch": 0.5775266037395539, + "flos": 22529248400160.0, + "grad_norm": 2.236013767526702, + "language_loss": 0.80898482, + "learning_rate": 1.5976983293206438e-06, + "loss": 0.83705956, + "num_input_tokens_seen": 103655605, + "router_z_loss_clip": 3.60351562, + "router_z_loss_mlp": 2.45507812, + "step": 4803, + "time_per_iteration": 3.0432164669036865 + }, + { + "auxiliary_loss_clip": 0.01515667, + "auxiliary_loss_mlp": 0.01281846, + "balance_loss_clip": 1.15437484, + "balance_loss_mlp": 1.03770518, + "epoch": 0.577646846630193, + "flos": 21070896075360.0, + "grad_norm": 2.004102134718965, + "language_loss": 0.71269214, + "learning_rate": 1.5969353101696928e-06, + "loss": 0.74066728, + "num_input_tokens_seen": 103674045, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 2.4375, + "step": 4804, + "time_per_iteration": 2.993311882019043 + }, + { + "auxiliary_loss_clip": 0.01520023, + "auxiliary_loss_mlp": 0.01277562, + "balance_loss_clip": 1.15876055, + "balance_loss_mlp": 1.03399348, + "epoch": 0.5777670895208321, + "flos": 29716782347520.0, + "grad_norm": 1.8512356315007377, + "language_loss": 0.79792929, + "learning_rate": 1.5961723521633341e-06, + "loss": 0.8259052, + "num_input_tokens_seen": 103695285, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 2.43164062, + "step": 4805, + "time_per_iteration": 3.142500877380371 + }, + { + "auxiliary_loss_clip": 0.01514304, + "auxiliary_loss_mlp": 0.01277257, + "balance_loss_clip": 1.15237725, + "balance_loss_mlp": 1.03120923, + "epoch": 0.5778873324114712, + "flos": 19502475137280.0, + "grad_norm": 3.177226276063564, + "language_loss": 0.91050148, + "learning_rate": 1.5954094554173097e-06, + "loss": 0.93841714, + "num_input_tokens_seen": 103713275, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.45703125, + "step": 4806, + "time_per_iteration": 3.0091946125030518 + }, + { + "auxiliary_loss_clip": 0.01519906, + "auxiliary_loss_mlp": 0.01271067, + "balance_loss_clip": 1.15849304, + "balance_loss_mlp": 1.02749896, + "epoch": 0.5780075753021102, + "flos": 14138583266880.0, + "grad_norm": 2.3384010659341614, + "language_loss": 0.79146767, + "learning_rate": 1.5946466200473482e-06, + "loss": 0.81937742, + "num_input_tokens_seen": 103731185, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.43164062, + "step": 4807, + "time_per_iteration": 3.054598331451416 + }, + { + "auxiliary_loss_clip": 0.01517031, + "auxiliary_loss_mlp": 0.01281555, + "balance_loss_clip": 1.15365791, + "balance_loss_mlp": 1.03836823, + "epoch": 0.5781278181927494, + "flos": 15263657570880.0, + "grad_norm": 1.813217823756258, + "language_loss": 0.83185434, + "learning_rate": 1.5938838461691723e-06, + "loss": 0.85984027, + "num_input_tokens_seen": 103748095, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.42773438, + "step": 4808, + "time_per_iteration": 3.014702081680298 + }, + { + "auxiliary_loss_clip": 0.01516378, + "auxiliary_loss_mlp": 0.01289772, + "balance_loss_clip": 1.15459645, + "balance_loss_mlp": 1.04505944, + "epoch": 0.5782480610833884, + "flos": 16728685251840.0, + "grad_norm": 4.09081823928004, + "language_loss": 0.83374035, + "learning_rate": 1.593121133898494e-06, + "loss": 0.86180186, + "num_input_tokens_seen": 103765300, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.44335938, + "step": 4809, + "time_per_iteration": 3.0343713760375977 + }, + { + "auxiliary_loss_clip": 0.01519981, + "auxiliary_loss_mlp": 0.01276515, + "balance_loss_clip": 1.15907705, + "balance_loss_mlp": 1.03161168, + "epoch": 0.5783683039740275, + "flos": 25484412568320.0, + "grad_norm": 2.166944573501622, + "language_loss": 0.7997663, + "learning_rate": 1.592358483351016e-06, + "loss": 0.82773131, + "num_input_tokens_seen": 103785475, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 2.4453125, + "step": 4810, + "time_per_iteration": 2.9765937328338623 + }, + { + "auxiliary_loss_clip": 0.01514147, + "auxiliary_loss_mlp": 0.01273213, + "balance_loss_clip": 1.1514256, + "balance_loss_mlp": 1.02850032, + "epoch": 0.5784885468646667, + "flos": 18407819581920.0, + "grad_norm": 2.0782861448689065, + "language_loss": 0.72361112, + "learning_rate": 1.5915958946424326e-06, + "loss": 0.75148469, + "num_input_tokens_seen": 103804160, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.44335938, + "step": 4811, + "time_per_iteration": 2.928096294403076 + }, + { + "auxiliary_loss_clip": 0.01517303, + "auxiliary_loss_mlp": 0.01271614, + "balance_loss_clip": 1.15562701, + "balance_loss_mlp": 1.02632904, + "epoch": 0.5786087897553057, + "flos": 46104376377600.0, + "grad_norm": 1.6302585269492826, + "language_loss": 0.7455132, + "learning_rate": 1.5908333678884271e-06, + "loss": 0.77340239, + "num_input_tokens_seen": 103830580, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 2.44921875, + "step": 4812, + "time_per_iteration": 3.1813879013061523 + }, + { + "auxiliary_loss_clip": 0.01519639, + "auxiliary_loss_mlp": 0.01279254, + "balance_loss_clip": 1.15737724, + "balance_loss_mlp": 1.03206182, + "epoch": 0.5787290326459448, + "flos": 12387612273120.0, + "grad_norm": 2.314101083731656, + "language_loss": 0.74142623, + "learning_rate": 1.5900709032046743e-06, + "loss": 0.76941514, + "num_input_tokens_seen": 103848655, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.46875, + "step": 4813, + "time_per_iteration": 3.0374717712402344 + }, + { + "auxiliary_loss_clip": 0.01517214, + "auxiliary_loss_mlp": 0.01268657, + "balance_loss_clip": 1.1552701, + "balance_loss_mlp": 1.02241826, + "epoch": 0.5788492755365839, + "flos": 23292332701920.0, + "grad_norm": 2.01966037815146, + "language_loss": 0.78217578, + "learning_rate": 1.5893085007068391e-06, + "loss": 0.81003445, + "num_input_tokens_seen": 103866215, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 2.45898438, + "step": 4814, + "time_per_iteration": 3.0505645275115967 + }, + { + "auxiliary_loss_clip": 0.01521439, + "auxiliary_loss_mlp": 0.01273086, + "balance_loss_clip": 1.16032529, + "balance_loss_mlp": 1.02665675, + "epoch": 0.578969518427223, + "flos": 24063343989120.0, + "grad_norm": 1.8985928098516123, + "language_loss": 0.70980632, + "learning_rate": 1.5885461605105786e-06, + "loss": 0.7377516, + "num_input_tokens_seen": 103887815, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 2.4609375, + "step": 4815, + "time_per_iteration": 3.074082136154175 + }, + { + "auxiliary_loss_clip": 0.01516262, + "auxiliary_loss_mlp": 0.01278455, + "balance_loss_clip": 1.15452123, + "balance_loss_mlp": 1.03030849, + "epoch": 0.579089761317862, + "flos": 21873994950240.0, + "grad_norm": 2.2737555112440138, + "language_loss": 0.77452743, + "learning_rate": 1.5877838827315375e-06, + "loss": 0.80247462, + "num_input_tokens_seen": 103906360, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 2.47851562, + "step": 4816, + "time_per_iteration": 4.744425535202026 + }, + { + "auxiliary_loss_clip": 0.01524048, + "auxiliary_loss_mlp": 0.01275247, + "balance_loss_clip": 1.16427255, + "balance_loss_mlp": 1.02748263, + "epoch": 0.5792100042085012, + "flos": 22931897754240.0, + "grad_norm": 1.9528437220683652, + "language_loss": 0.70154321, + "learning_rate": 1.587021667485355e-06, + "loss": 0.72953612, + "num_input_tokens_seen": 103925730, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 2.47460938, + "step": 4817, + "time_per_iteration": 3.0269393920898438 + }, + { + "auxiliary_loss_clip": 0.01513239, + "auxiliary_loss_mlp": 0.01277634, + "balance_loss_clip": 1.15137625, + "balance_loss_mlp": 1.03292131, + "epoch": 0.5793302470991403, + "flos": 21472180015680.0, + "grad_norm": 1.712277424130031, + "language_loss": 0.78373551, + "learning_rate": 1.5862595148876559e-06, + "loss": 0.8116442, + "num_input_tokens_seen": 103945835, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.44335938, + "step": 4818, + "time_per_iteration": 3.0625617504119873 + }, + { + "auxiliary_loss_clip": 0.01521115, + "auxiliary_loss_mlp": 0.01278619, + "balance_loss_clip": 1.16140997, + "balance_loss_mlp": 1.03028178, + "epoch": 0.5794504899897793, + "flos": 12712508170560.0, + "grad_norm": 1.998716319851918, + "language_loss": 0.76092887, + "learning_rate": 1.58549742505406e-06, + "loss": 0.78892624, + "num_input_tokens_seen": 103960580, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 2.48046875, + "step": 4819, + "time_per_iteration": 3.834719181060791 + }, + { + "auxiliary_loss_clip": 0.01519312, + "auxiliary_loss_mlp": 0.01275609, + "balance_loss_clip": 1.15712595, + "balance_loss_mlp": 1.03356636, + "epoch": 0.5795707328804185, + "flos": 14868859345920.0, + "grad_norm": 2.150259640335265, + "language_loss": 0.75720996, + "learning_rate": 1.5847353981001747e-06, + "loss": 0.78515923, + "num_input_tokens_seen": 103977760, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.41601562, + "step": 4820, + "time_per_iteration": 2.991600751876831 + }, + { + "auxiliary_loss_clip": 0.01513601, + "auxiliary_loss_mlp": 0.01264124, + "balance_loss_clip": 1.1509006, + "balance_loss_mlp": 1.01826715, + "epoch": 0.5796909757710575, + "flos": 36433054170720.0, + "grad_norm": 2.398685116948902, + "language_loss": 0.69874853, + "learning_rate": 1.5839734341415993e-06, + "loss": 0.72652572, + "num_input_tokens_seen": 103999960, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.45507812, + "step": 4821, + "time_per_iteration": 3.107736825942993 + }, + { + "auxiliary_loss_clip": 0.01513328, + "auxiliary_loss_mlp": 0.01270828, + "balance_loss_clip": 1.15020061, + "balance_loss_mlp": 1.02783167, + "epoch": 0.5798112186616966, + "flos": 23042156008320.0, + "grad_norm": 1.7055590572499097, + "language_loss": 0.76317203, + "learning_rate": 1.5832115332939238e-06, + "loss": 0.7910136, + "num_input_tokens_seen": 104018400, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.42578125, + "step": 4822, + "time_per_iteration": 3.0085198879241943 + }, + { + "auxiliary_loss_clip": 0.01518272, + "auxiliary_loss_mlp": 0.01273937, + "balance_loss_clip": 1.15848756, + "balance_loss_mlp": 1.02579117, + "epoch": 0.5799314615523358, + "flos": 16654383257760.0, + "grad_norm": 2.511751239485415, + "language_loss": 0.75000906, + "learning_rate": 1.5824496956727272e-06, + "loss": 0.77793121, + "num_input_tokens_seen": 104035605, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 2.47851562, + "step": 4823, + "time_per_iteration": 2.9995017051696777 + }, + { + "auxiliary_loss_clip": 0.01510577, + "auxiliary_loss_mlp": 0.01270201, + "balance_loss_clip": 1.14934099, + "balance_loss_mlp": 1.0254879, + "epoch": 0.5800517044429748, + "flos": 20487782714400.0, + "grad_norm": 1.7628148636018621, + "language_loss": 0.72892338, + "learning_rate": 1.5816879213935797e-06, + "loss": 0.75673115, + "num_input_tokens_seen": 104054415, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.44335938, + "step": 4824, + "time_per_iteration": 3.1257495880126953 + }, + { + "auxiliary_loss_clip": 0.01514716, + "auxiliary_loss_mlp": 0.01267335, + "balance_loss_clip": 1.15309179, + "balance_loss_mlp": 1.02491152, + "epoch": 0.5801719473336139, + "flos": 31540841634240.0, + "grad_norm": 4.284930613250234, + "language_loss": 0.79798758, + "learning_rate": 1.5809262105720416e-06, + "loss": 0.82580817, + "num_input_tokens_seen": 104075455, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.41992188, + "step": 4825, + "time_per_iteration": 4.039155006408691 + }, + { + "auxiliary_loss_clip": 0.01515463, + "auxiliary_loss_mlp": 0.01272131, + "balance_loss_clip": 1.15418506, + "balance_loss_mlp": 1.0272274, + "epoch": 0.580292190224253, + "flos": 20378055454560.0, + "grad_norm": 1.6477613270383584, + "language_loss": 0.79417372, + "learning_rate": 1.5801645633236644e-06, + "loss": 0.82204968, + "num_input_tokens_seen": 104096440, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 2.4453125, + "step": 4826, + "time_per_iteration": 3.1011998653411865 + }, + { + "auxiliary_loss_clip": 0.01509504, + "auxiliary_loss_mlp": 0.01277145, + "balance_loss_clip": 1.14668345, + "balance_loss_mlp": 1.03224134, + "epoch": 0.5804124331148921, + "flos": 26617831067520.0, + "grad_norm": 2.1591032782289834, + "language_loss": 0.774809, + "learning_rate": 1.579402979763989e-06, + "loss": 0.80267543, + "num_input_tokens_seen": 104116775, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.4453125, + "step": 4827, + "time_per_iteration": 3.0751681327819824 + }, + { + "auxiliary_loss_clip": 0.01519414, + "auxiliary_loss_mlp": 0.012655, + "balance_loss_clip": 1.15790796, + "balance_loss_mlp": 1.01754427, + "epoch": 0.5805326760055312, + "flos": 13480105923360.0, + "grad_norm": 2.501148625892626, + "language_loss": 0.8133598, + "learning_rate": 1.578641460008548e-06, + "loss": 0.84120893, + "num_input_tokens_seen": 104134510, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 2.4765625, + "step": 4828, + "time_per_iteration": 3.0341391563415527 + }, + { + "auxiliary_loss_clip": 0.01515591, + "auxiliary_loss_mlp": 0.01273105, + "balance_loss_clip": 1.1550281, + "balance_loss_mlp": 1.02915502, + "epoch": 0.5806529188961702, + "flos": 12092869627200.0, + "grad_norm": 2.0208045657016327, + "language_loss": 0.67844254, + "learning_rate": 1.5778800041728613e-06, + "loss": 0.70632952, + "num_input_tokens_seen": 104150800, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 2.43554688, + "step": 4829, + "time_per_iteration": 2.97021484375 + }, + { + "auxiliary_loss_clip": 0.01514692, + "auxiliary_loss_mlp": 0.01265692, + "balance_loss_clip": 1.15350926, + "balance_loss_mlp": 1.02155113, + "epoch": 0.5807731617868094, + "flos": 26216547127200.0, + "grad_norm": 1.6673058801773704, + "language_loss": 0.66487777, + "learning_rate": 1.577118612372443e-06, + "loss": 0.69268167, + "num_input_tokens_seen": 104172640, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 2.4375, + "step": 4830, + "time_per_iteration": 3.072366237640381 + }, + { + "auxiliary_loss_clip": 0.01513417, + "auxiliary_loss_mlp": 0.01271629, + "balance_loss_clip": 1.15246725, + "balance_loss_mlp": 1.02462697, + "epoch": 0.5808934046774484, + "flos": 37965215423520.0, + "grad_norm": 5.531236505912879, + "language_loss": 0.70634657, + "learning_rate": 1.5763572847227943e-06, + "loss": 0.73419702, + "num_input_tokens_seen": 104193525, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 2.46679688, + "step": 4831, + "time_per_iteration": 3.0927507877349854 + }, + { + "auxiliary_loss_clip": 0.01507952, + "auxiliary_loss_mlp": 0.01270128, + "balance_loss_clip": 1.14611053, + "balance_loss_mlp": 1.02579689, + "epoch": 0.5810136475680875, + "flos": 20487972355200.0, + "grad_norm": 1.781715752690166, + "language_loss": 0.81297684, + "learning_rate": 1.5755960213394091e-06, + "loss": 0.84075761, + "num_input_tokens_seen": 104210625, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 2.43945312, + "step": 4832, + "time_per_iteration": 3.0170130729675293 + }, + { + "auxiliary_loss_clip": 0.015139, + "auxiliary_loss_mlp": 0.0127492, + "balance_loss_clip": 1.15327859, + "balance_loss_mlp": 1.03058887, + "epoch": 0.5811338904587267, + "flos": 17532087552000.0, + "grad_norm": 2.970951889602554, + "language_loss": 0.78299183, + "learning_rate": 1.5748348223377703e-06, + "loss": 0.81087995, + "num_input_tokens_seen": 104228180, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 2.43945312, + "step": 4833, + "time_per_iteration": 2.9803266525268555 + }, + { + "auxiliary_loss_clip": 0.01509969, + "auxiliary_loss_mlp": 0.01266182, + "balance_loss_clip": 1.14715624, + "balance_loss_mlp": 1.02318573, + "epoch": 0.5812541333493657, + "flos": 19459729736640.0, + "grad_norm": 1.7294559783440833, + "language_loss": 0.77967024, + "learning_rate": 1.5740736878333507e-06, + "loss": 0.8074317, + "num_input_tokens_seen": 104246020, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.42578125, + "step": 4834, + "time_per_iteration": 3.0039761066436768 + }, + { + "auxiliary_loss_clip": 0.01510602, + "auxiliary_loss_mlp": 0.01269028, + "balance_loss_clip": 1.14849913, + "balance_loss_mlp": 1.02240753, + "epoch": 0.5813743762400048, + "flos": 20597054836320.0, + "grad_norm": 2.505092613241366, + "language_loss": 0.77972537, + "learning_rate": 1.5733126179416143e-06, + "loss": 0.80752164, + "num_input_tokens_seen": 104260505, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.46289062, + "step": 4835, + "time_per_iteration": 3.0708887577056885 + }, + { + "auxiliary_loss_clip": 0.01512321, + "auxiliary_loss_mlp": 0.01271061, + "balance_loss_clip": 1.1506083, + "balance_loss_mlp": 1.02615774, + "epoch": 0.5814946191306439, + "flos": 33180795810720.0, + "grad_norm": 2.8315975566610407, + "language_loss": 0.72820652, + "learning_rate": 1.5725516127780137e-06, + "loss": 0.75604033, + "num_input_tokens_seen": 104282640, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.4453125, + "step": 4836, + "time_per_iteration": 3.104834794998169 + }, + { + "auxiliary_loss_clip": 0.01505427, + "auxiliary_loss_mlp": 0.01272415, + "balance_loss_clip": 1.14395583, + "balance_loss_mlp": 1.02445948, + "epoch": 0.581614862021283, + "flos": 16145116752960.0, + "grad_norm": 2.5874265218570747, + "language_loss": 0.89142466, + "learning_rate": 1.5717906724579943e-06, + "loss": 0.9192031, + "num_input_tokens_seen": 104299700, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.4765625, + "step": 4837, + "time_per_iteration": 3.022712230682373 + }, + { + "auxiliary_loss_clip": 0.01509932, + "auxiliary_loss_mlp": 0.01280073, + "balance_loss_clip": 1.14685249, + "balance_loss_mlp": 1.03612351, + "epoch": 0.581735104911922, + "flos": 33805478799360.0, + "grad_norm": 2.336323969435234, + "language_loss": 0.68442583, + "learning_rate": 1.571029797096989e-06, + "loss": 0.71232587, + "num_input_tokens_seen": 104320805, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.43554688, + "step": 4838, + "time_per_iteration": 3.0917699337005615 + }, + { + "auxiliary_loss_clip": 0.01505169, + "auxiliary_loss_mlp": 0.01271986, + "balance_loss_clip": 1.14197969, + "balance_loss_mlp": 1.02860796, + "epoch": 0.5818553478025612, + "flos": 23333409263520.0, + "grad_norm": 5.316535322609378, + "language_loss": 0.78917015, + "learning_rate": 1.570268986810423e-06, + "loss": 0.81694168, + "num_input_tokens_seen": 104340700, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 2.4296875, + "step": 4839, + "time_per_iteration": 2.965773105621338 + }, + { + "auxiliary_loss_clip": 0.01508632, + "auxiliary_loss_mlp": 0.01274768, + "balance_loss_clip": 1.14603269, + "balance_loss_mlp": 1.03196239, + "epoch": 0.5819755906932003, + "flos": 20998262920320.0, + "grad_norm": 2.080000682502295, + "language_loss": 0.74949551, + "learning_rate": 1.5695082417137096e-06, + "loss": 0.7773295, + "num_input_tokens_seen": 104358575, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.42382812, + "step": 4840, + "time_per_iteration": 3.0267231464385986 + }, + { + "auxiliary_loss_clip": 0.01506108, + "auxiliary_loss_mlp": 0.01269485, + "balance_loss_clip": 1.1441654, + "balance_loss_mlp": 1.02610743, + "epoch": 0.5820958335838393, + "flos": 21433682568960.0, + "grad_norm": 1.6374225325792342, + "language_loss": 0.75296217, + "learning_rate": 1.5687475619222539e-06, + "loss": 0.78071809, + "num_input_tokens_seen": 104378530, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.4296875, + "step": 4841, + "time_per_iteration": 3.0181000232696533 + }, + { + "auxiliary_loss_clip": 0.01509403, + "auxiliary_loss_mlp": 0.01271416, + "balance_loss_clip": 1.14530921, + "balance_loss_mlp": 1.02536833, + "epoch": 0.5822160764744785, + "flos": 17969403608640.0, + "grad_norm": 2.1658212192269626, + "language_loss": 0.73382223, + "learning_rate": 1.5679869475514496e-06, + "loss": 0.76163042, + "num_input_tokens_seen": 104395465, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.45703125, + "step": 4842, + "time_per_iteration": 3.0247340202331543 + }, + { + "auxiliary_loss_clip": 0.01504356, + "auxiliary_loss_mlp": 0.01279646, + "balance_loss_clip": 1.13981545, + "balance_loss_mlp": 1.03073728, + "epoch": 0.5823363193651175, + "flos": 23035784077440.0, + "grad_norm": 2.20050316544517, + "language_loss": 0.81340641, + "learning_rate": 1.567226398716682e-06, + "loss": 0.84124649, + "num_input_tokens_seen": 104415380, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 2.48632812, + "step": 4843, + "time_per_iteration": 3.0062615871429443 + }, + { + "auxiliary_loss_clip": 0.01507109, + "auxiliary_loss_mlp": 0.01268748, + "balance_loss_clip": 1.1420722, + "balance_loss_mlp": 1.02308202, + "epoch": 0.5824565622557566, + "flos": 32894700785280.0, + "grad_norm": 3.5160505972046225, + "language_loss": 0.62215179, + "learning_rate": 1.566465915533326e-06, + "loss": 0.64991033, + "num_input_tokens_seen": 104437410, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 2.453125, + "step": 4844, + "time_per_iteration": 4.760842323303223 + }, + { + "auxiliary_loss_clip": 0.01505607, + "auxiliary_loss_mlp": 0.0127938, + "balance_loss_clip": 1.14209628, + "balance_loss_mlp": 1.03333247, + "epoch": 0.5825768051463958, + "flos": 22231471501440.0, + "grad_norm": 1.9591499386461468, + "language_loss": 0.88330317, + "learning_rate": 1.5657054981167458e-06, + "loss": 0.91115296, + "num_input_tokens_seen": 104456305, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 2.45703125, + "step": 4845, + "time_per_iteration": 2.9947397708892822 + }, + { + "auxiliary_loss_clip": 0.015098, + "auxiliary_loss_mlp": 0.01265011, + "balance_loss_clip": 1.14909613, + "balance_loss_mlp": 1.02201462, + "epoch": 0.5826970480370348, + "flos": 28004081231520.0, + "grad_norm": 1.924359340734683, + "language_loss": 0.67876804, + "learning_rate": 1.5649451465822965e-06, + "loss": 0.70651615, + "num_input_tokens_seen": 104477695, + "router_z_loss_clip": 3.609375, + "router_z_loss_mlp": 2.42578125, + "step": 4846, + "time_per_iteration": 3.0633723735809326 + }, + { + "auxiliary_loss_clip": 0.01505413, + "auxiliary_loss_mlp": 0.01262381, + "balance_loss_clip": 1.14222383, + "balance_loss_mlp": 1.01976621, + "epoch": 0.5828172909276739, + "flos": 17859979774080.0, + "grad_norm": 1.7330027425689492, + "language_loss": 0.83659708, + "learning_rate": 1.5641848610453218e-06, + "loss": 0.86427498, + "num_input_tokens_seen": 104496355, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.421875, + "step": 4847, + "time_per_iteration": 3.9181792736053467 + }, + { + "auxiliary_loss_clip": 0.0150863, + "auxiliary_loss_mlp": 0.01271541, + "balance_loss_clip": 1.14505792, + "balance_loss_mlp": 1.02740061, + "epoch": 0.582937533818313, + "flos": 19867309751520.0, + "grad_norm": 2.2195806811405605, + "language_loss": 0.86226928, + "learning_rate": 1.563424641621158e-06, + "loss": 0.89007103, + "num_input_tokens_seen": 104515535, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 2.4375, + "step": 4848, + "time_per_iteration": 3.0090107917785645 + }, + { + "auxiliary_loss_clip": 0.0150744, + "auxiliary_loss_mlp": 0.01271452, + "balance_loss_clip": 1.14335239, + "balance_loss_mlp": 1.02597654, + "epoch": 0.5830577767089521, + "flos": 26873203919040.0, + "grad_norm": 2.07802003745587, + "language_loss": 0.69974184, + "learning_rate": 1.5626644884251282e-06, + "loss": 0.72753078, + "num_input_tokens_seen": 104535055, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 2.45117188, + "step": 4849, + "time_per_iteration": 2.9820470809936523 + }, + { + "auxiliary_loss_clip": 0.0150729, + "auxiliary_loss_mlp": 0.0126713, + "balance_loss_clip": 1.14530838, + "balance_loss_mlp": 1.02375269, + "epoch": 0.5831780195995911, + "flos": 25300383314400.0, + "grad_norm": 1.6725140961039509, + "language_loss": 0.88151646, + "learning_rate": 1.5619044015725488e-06, + "loss": 0.90926069, + "num_input_tokens_seen": 104554745, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.4296875, + "step": 4850, + "time_per_iteration": 2.9648687839508057 + }, + { + "auxiliary_loss_clip": 0.01512127, + "auxiliary_loss_mlp": 0.01282005, + "balance_loss_clip": 1.14862168, + "balance_loss_mlp": 1.03309655, + "epoch": 0.5832982624902303, + "flos": 14758752804480.0, + "grad_norm": 4.427630160390062, + "language_loss": 0.87717736, + "learning_rate": 1.5611443811787224e-06, + "loss": 0.9051187, + "num_input_tokens_seen": 104568870, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.48632812, + "step": 4851, + "time_per_iteration": 2.9917688369750977 + }, + { + "auxiliary_loss_clip": 0.01509843, + "auxiliary_loss_mlp": 0.0126154, + "balance_loss_clip": 1.14597917, + "balance_loss_mlp": 1.02140474, + "epoch": 0.5834185053808694, + "flos": 20446402727520.0, + "grad_norm": 3.563332049753569, + "language_loss": 0.69208229, + "learning_rate": 1.560384427358945e-06, + "loss": 0.71979612, + "num_input_tokens_seen": 104588415, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.39648438, + "step": 4852, + "time_per_iteration": 3.014436960220337 + }, + { + "auxiliary_loss_clip": 0.01506308, + "auxiliary_loss_mlp": 0.01277629, + "balance_loss_clip": 1.14251614, + "balance_loss_mlp": 1.03081846, + "epoch": 0.5835387482715084, + "flos": 27202954620960.0, + "grad_norm": 1.6327874687888528, + "language_loss": 0.72925365, + "learning_rate": 1.5596245402284998e-06, + "loss": 0.75709295, + "num_input_tokens_seen": 104611940, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.46484375, + "step": 4853, + "time_per_iteration": 3.8720803260803223 + }, + { + "auxiliary_loss_clip": 0.01510364, + "auxiliary_loss_mlp": 0.01279784, + "balance_loss_clip": 1.14672649, + "balance_loss_mlp": 1.03583455, + "epoch": 0.5836589911621476, + "flos": 16656507234720.0, + "grad_norm": 1.8311540764651841, + "language_loss": 0.81893742, + "learning_rate": 1.5588647199026619e-06, + "loss": 0.84683895, + "num_input_tokens_seen": 104629675, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.43554688, + "step": 4854, + "time_per_iteration": 3.0278818607330322 + }, + { + "auxiliary_loss_clip": 0.01513968, + "auxiliary_loss_mlp": 0.01279572, + "balance_loss_clip": 1.14938378, + "balance_loss_mlp": 1.03371501, + "epoch": 0.5837792340527866, + "flos": 20448792201600.0, + "grad_norm": 4.120869963533091, + "language_loss": 0.87648749, + "learning_rate": 1.5581049664966956e-06, + "loss": 0.90442288, + "num_input_tokens_seen": 104647435, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 2.45507812, + "step": 4855, + "time_per_iteration": 3.1163482666015625 + }, + { + "auxiliary_loss_clip": 0.01531124, + "auxiliary_loss_mlp": 0.01208275, + "balance_loss_clip": 1.17493773, + "balance_loss_mlp": 1.00762177, + "epoch": 0.5838994769434257, + "flos": 66000815320320.0, + "grad_norm": 0.9963675186735097, + "language_loss": 0.65063757, + "learning_rate": 1.5573452801258545e-06, + "loss": 0.67803156, + "num_input_tokens_seen": 104694605, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.00390625, + "step": 4856, + "time_per_iteration": 3.272155284881592 + }, + { + "auxiliary_loss_clip": 0.0150911, + "auxiliary_loss_mlp": 0.01269481, + "balance_loss_clip": 1.14612389, + "balance_loss_mlp": 1.0230515, + "epoch": 0.5840197198340649, + "flos": 21472673081760.0, + "grad_norm": 2.0213040385615573, + "language_loss": 0.63429797, + "learning_rate": 1.5565856609053824e-06, + "loss": 0.66208386, + "num_input_tokens_seen": 104713400, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.4609375, + "step": 4857, + "time_per_iteration": 3.092172861099243 + }, + { + "auxiliary_loss_clip": 0.01512832, + "auxiliary_loss_mlp": 0.01267301, + "balance_loss_clip": 1.1480341, + "balance_loss_mlp": 1.02239764, + "epoch": 0.5841399627247039, + "flos": 19137147456960.0, + "grad_norm": 2.266364082954483, + "language_loss": 0.79868007, + "learning_rate": 1.5558261089505127e-06, + "loss": 0.82648134, + "num_input_tokens_seen": 104732130, + "router_z_loss_clip": 3.65039062, + "router_z_loss_mlp": 2.4453125, + "step": 4858, + "time_per_iteration": 3.021289348602295 + }, + { + "auxiliary_loss_clip": 0.01519513, + "auxiliary_loss_mlp": 0.01282901, + "balance_loss_clip": 1.15636647, + "balance_loss_mlp": 1.03647184, + "epoch": 0.584260205615343, + "flos": 26427733308000.0, + "grad_norm": 1.9696409916764737, + "language_loss": 0.79849505, + "learning_rate": 1.5550666243764697e-06, + "loss": 0.82651925, + "num_input_tokens_seen": 104750290, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.4609375, + "step": 4859, + "time_per_iteration": 3.133918523788452 + }, + { + "auxiliary_loss_clip": 0.01506216, + "auxiliary_loss_mlp": 0.01279251, + "balance_loss_clip": 1.14241076, + "balance_loss_mlp": 1.03587341, + "epoch": 0.584380448505982, + "flos": 13883551768800.0, + "grad_norm": 2.18559549038058, + "language_loss": 0.77345365, + "learning_rate": 1.554307207298465e-06, + "loss": 0.80130833, + "num_input_tokens_seen": 104768550, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.4296875, + "step": 4860, + "time_per_iteration": 3.0709686279296875 + }, + { + "auxiliary_loss_clip": 0.01507464, + "auxiliary_loss_mlp": 0.01272285, + "balance_loss_clip": 1.14450479, + "balance_loss_mlp": 1.02642751, + "epoch": 0.5845006913966212, + "flos": 21545875159200.0, + "grad_norm": 2.356945489404575, + "language_loss": 0.78975308, + "learning_rate": 1.553547857831704e-06, + "loss": 0.81755054, + "num_input_tokens_seen": 104785060, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.45507812, + "step": 4861, + "time_per_iteration": 2.9749884605407715 + }, + { + "auxiliary_loss_clip": 0.01534059, + "auxiliary_loss_mlp": 0.01211952, + "balance_loss_clip": 1.1781621, + "balance_loss_mlp": 1.01129913, + "epoch": 0.5846209342872603, + "flos": 58380668408160.0, + "grad_norm": 0.8958459296866614, + "language_loss": 0.64130032, + "learning_rate": 1.5527885760913771e-06, + "loss": 0.66876042, + "num_input_tokens_seen": 104834950, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.0078125, + "step": 4862, + "time_per_iteration": 3.3458404541015625 + }, + { + "auxiliary_loss_clip": 0.01517221, + "auxiliary_loss_mlp": 0.0126558, + "balance_loss_clip": 1.15544116, + "balance_loss_mlp": 1.0235374, + "epoch": 0.5847411771778993, + "flos": 18590028284160.0, + "grad_norm": 1.8035039534446544, + "language_loss": 0.76265335, + "learning_rate": 1.552029362192668e-06, + "loss": 0.79048133, + "num_input_tokens_seen": 104854210, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.41601562, + "step": 4863, + "time_per_iteration": 3.041259527206421 + }, + { + "auxiliary_loss_clip": 0.01514839, + "auxiliary_loss_mlp": 0.0127478, + "balance_loss_clip": 1.15224695, + "balance_loss_mlp": 1.02930415, + "epoch": 0.5848614200685385, + "flos": 24242935648320.0, + "grad_norm": 2.877149974241721, + "language_loss": 0.72362101, + "learning_rate": 1.5512702162507478e-06, + "loss": 0.75151718, + "num_input_tokens_seen": 104874525, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 2.45117188, + "step": 4864, + "time_per_iteration": 3.018397331237793 + }, + { + "auxiliary_loss_clip": 0.01535126, + "auxiliary_loss_mlp": 0.01205849, + "balance_loss_clip": 1.17977285, + "balance_loss_mlp": 1.0067215, + "epoch": 0.5849816629591775, + "flos": 71666428618080.0, + "grad_norm": 1.1312921107087424, + "language_loss": 0.5570485, + "learning_rate": 1.5505111383807792e-06, + "loss": 0.58445829, + "num_input_tokens_seen": 104937195, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 1.98828125, + "step": 4865, + "time_per_iteration": 3.4950883388519287 + }, + { + "auxiliary_loss_clip": 0.01506913, + "auxiliary_loss_mlp": 0.01270112, + "balance_loss_clip": 1.14374185, + "balance_loss_mlp": 1.02444577, + "epoch": 0.5851019058498166, + "flos": 23804178321600.0, + "grad_norm": 1.918240020648305, + "language_loss": 0.80333924, + "learning_rate": 1.5497521286979138e-06, + "loss": 0.8311094, + "num_input_tokens_seen": 104957435, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.453125, + "step": 4866, + "time_per_iteration": 3.0516347885131836 + }, + { + "auxiliary_loss_clip": 0.01505172, + "auxiliary_loss_mlp": 0.01282622, + "balance_loss_clip": 1.14189637, + "balance_loss_mlp": 1.0340941, + "epoch": 0.5852221487404557, + "flos": 24390781073280.0, + "grad_norm": 2.217473084347132, + "language_loss": 0.74637485, + "learning_rate": 1.5489931873172927e-06, + "loss": 0.77425277, + "num_input_tokens_seen": 104978755, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 2.48242188, + "step": 4867, + "time_per_iteration": 3.1484274864196777 + }, + { + "auxiliary_loss_clip": 0.01514586, + "auxiliary_loss_mlp": 0.01269353, + "balance_loss_clip": 1.15312886, + "balance_loss_mlp": 1.02654731, + "epoch": 0.5853423916310948, + "flos": 27273881008800.0, + "grad_norm": 8.188418800913007, + "language_loss": 0.79244304, + "learning_rate": 1.5482343143540467e-06, + "loss": 0.8202824, + "num_input_tokens_seen": 105000020, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 2.42382812, + "step": 4868, + "time_per_iteration": 3.039201498031616 + }, + { + "auxiliary_loss_clip": 0.01508164, + "auxiliary_loss_mlp": 0.01263264, + "balance_loss_clip": 1.14455628, + "balance_loss_mlp": 1.01969528, + "epoch": 0.5854626345217339, + "flos": 11985380128800.0, + "grad_norm": 2.802163257385514, + "language_loss": 0.82867604, + "learning_rate": 1.547475509923295e-06, + "loss": 0.85639036, + "num_input_tokens_seen": 105017060, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 2.43164062, + "step": 4869, + "time_per_iteration": 3.0014796257019043 + }, + { + "auxiliary_loss_clip": 0.01531157, + "auxiliary_loss_mlp": 0.01197578, + "balance_loss_clip": 1.17598712, + "balance_loss_mlp": 0.99768829, + "epoch": 0.585582877412373, + "flos": 64348989265440.0, + "grad_norm": 0.7366033065255231, + "language_loss": 0.56020117, + "learning_rate": 1.5467167741401495e-06, + "loss": 0.58748853, + "num_input_tokens_seen": 105078540, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 1.99609375, + "step": 4870, + "time_per_iteration": 4.461066961288452 + }, + { + "auxiliary_loss_clip": 0.0151714, + "auxiliary_loss_mlp": 0.01281231, + "balance_loss_clip": 1.1535728, + "balance_loss_mlp": 1.03594637, + "epoch": 0.5857031203030121, + "flos": 17013414863520.0, + "grad_norm": 2.379962918140489, + "language_loss": 0.71208942, + "learning_rate": 1.5459581071197083e-06, + "loss": 0.74007314, + "num_input_tokens_seen": 105094200, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 2.44921875, + "step": 4871, + "time_per_iteration": 3.9571146965026855 + }, + { + "auxiliary_loss_clip": 0.01514012, + "auxiliary_loss_mlp": 0.01273072, + "balance_loss_clip": 1.15065181, + "balance_loss_mlp": 1.027596, + "epoch": 0.5858233631936511, + "flos": 20887739169120.0, + "grad_norm": 3.3763849412526428, + "language_loss": 0.83044857, + "learning_rate": 1.5451995089770624e-06, + "loss": 0.8583194, + "num_input_tokens_seen": 105113985, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.45117188, + "step": 4872, + "time_per_iteration": 3.0144779682159424 + }, + { + "auxiliary_loss_clip": 0.01512671, + "auxiliary_loss_mlp": 0.01261539, + "balance_loss_clip": 1.14944434, + "balance_loss_mlp": 1.01911545, + "epoch": 0.5859436060842903, + "flos": 23194514884320.0, + "grad_norm": 1.371165463029806, + "language_loss": 0.72133213, + "learning_rate": 1.5444409798272885e-06, + "loss": 0.74907422, + "num_input_tokens_seen": 105138075, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 2.41992188, + "step": 4873, + "time_per_iteration": 3.044750690460205 + }, + { + "auxiliary_loss_clip": 0.0151985, + "auxiliary_loss_mlp": 0.01282885, + "balance_loss_clip": 1.15718722, + "balance_loss_mlp": 1.03626525, + "epoch": 0.5860638489749294, + "flos": 22494885122880.0, + "grad_norm": 2.3815331656836682, + "language_loss": 0.80885041, + "learning_rate": 1.543682519785456e-06, + "loss": 0.83687776, + "num_input_tokens_seen": 105156555, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.46289062, + "step": 4874, + "time_per_iteration": 3.138822555541992 + }, + { + "auxiliary_loss_clip": 0.01511346, + "auxiliary_loss_mlp": 0.01270075, + "balance_loss_clip": 1.14895153, + "balance_loss_mlp": 1.02822351, + "epoch": 0.5861840918655684, + "flos": 17568157596480.0, + "grad_norm": 3.175987635960249, + "language_loss": 0.80942643, + "learning_rate": 1.5429241289666219e-06, + "loss": 0.83724064, + "num_input_tokens_seen": 105174055, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.4140625, + "step": 4875, + "time_per_iteration": 3.8680641651153564 + }, + { + "auxiliary_loss_clip": 0.01508971, + "auxiliary_loss_mlp": 0.01271783, + "balance_loss_clip": 1.1470356, + "balance_loss_mlp": 1.02878642, + "epoch": 0.5863043347562076, + "flos": 25558486993440.0, + "grad_norm": 2.1106663321924093, + "language_loss": 0.69955224, + "learning_rate": 1.5421658074858342e-06, + "loss": 0.72735977, + "num_input_tokens_seen": 105192160, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.42578125, + "step": 4876, + "time_per_iteration": 3.0256283283233643 + }, + { + "auxiliary_loss_clip": 0.01515792, + "auxiliary_loss_mlp": 0.01279033, + "balance_loss_clip": 1.15273046, + "balance_loss_mlp": 1.03222239, + "epoch": 0.5864245776468466, + "flos": 20669612135040.0, + "grad_norm": 3.037300052471413, + "language_loss": 0.66790998, + "learning_rate": 1.5414075554581298e-06, + "loss": 0.69585818, + "num_input_tokens_seen": 105210205, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.46484375, + "step": 4877, + "time_per_iteration": 3.029360771179199 + }, + { + "auxiliary_loss_clip": 0.01506945, + "auxiliary_loss_mlp": 0.01269208, + "balance_loss_clip": 1.14381242, + "balance_loss_mlp": 1.02373183, + "epoch": 0.5865448205374857, + "flos": 28916490156480.0, + "grad_norm": 2.548810716862624, + "language_loss": 0.7851336, + "learning_rate": 1.5406493729985348e-06, + "loss": 0.81289506, + "num_input_tokens_seen": 105229400, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.45117188, + "step": 4878, + "time_per_iteration": 3.0458712577819824 + }, + { + "auxiliary_loss_clip": 0.01513927, + "auxiliary_loss_mlp": 0.01273611, + "balance_loss_clip": 1.15018129, + "balance_loss_mlp": 1.02889788, + "epoch": 0.5866650634281249, + "flos": 25844619947040.0, + "grad_norm": 2.3706186396434736, + "language_loss": 0.72018957, + "learning_rate": 1.5398912602220644e-06, + "loss": 0.74806488, + "num_input_tokens_seen": 105248675, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 2.44335938, + "step": 4879, + "time_per_iteration": 3.1159684658050537 + }, + { + "auxiliary_loss_clip": 0.01519332, + "auxiliary_loss_mlp": 0.01280513, + "balance_loss_clip": 1.15635788, + "balance_loss_mlp": 1.03465581, + "epoch": 0.5867853063187639, + "flos": 17054036287200.0, + "grad_norm": 2.2494625932811614, + "language_loss": 0.78571224, + "learning_rate": 1.539133217243724e-06, + "loss": 0.81371069, + "num_input_tokens_seen": 105265695, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 2.45507812, + "step": 4880, + "time_per_iteration": 3.805468797683716 + }, + { + "auxiliary_loss_clip": 0.015116, + "auxiliary_loss_mlp": 0.01278675, + "balance_loss_clip": 1.14812279, + "balance_loss_mlp": 1.02957499, + "epoch": 0.586905549209403, + "flos": 24647367625920.0, + "grad_norm": 2.6163472821131717, + "language_loss": 0.76311052, + "learning_rate": 1.5383752441785081e-06, + "loss": 0.79101324, + "num_input_tokens_seen": 105284920, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.48828125, + "step": 4881, + "time_per_iteration": 3.12606143951416 + }, + { + "auxiliary_loss_clip": 0.0151651, + "auxiliary_loss_mlp": 0.01279418, + "balance_loss_clip": 1.152915, + "balance_loss_mlp": 1.03356099, + "epoch": 0.5870257921000421, + "flos": 14722303478400.0, + "grad_norm": 3.491107822541365, + "language_loss": 0.86465257, + "learning_rate": 1.5376173411414003e-06, + "loss": 0.89261192, + "num_input_tokens_seen": 105302960, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 2.45507812, + "step": 4882, + "time_per_iteration": 3.1623730659484863 + }, + { + "auxiliary_loss_clip": 0.01518031, + "auxiliary_loss_mlp": 0.01269504, + "balance_loss_clip": 1.1542207, + "balance_loss_mlp": 1.0240283, + "epoch": 0.5871460349906812, + "flos": 23917281187680.0, + "grad_norm": 2.0767061613875897, + "language_loss": 0.79469597, + "learning_rate": 1.5368595082473753e-06, + "loss": 0.82257128, + "num_input_tokens_seen": 105321260, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 2.45117188, + "step": 4883, + "time_per_iteration": 3.067577838897705 + }, + { + "auxiliary_loss_clip": 0.01514204, + "auxiliary_loss_mlp": 0.0126942, + "balance_loss_clip": 1.15125167, + "balance_loss_mlp": 1.02642405, + "epoch": 0.5872662778813202, + "flos": 22166386050240.0, + "grad_norm": 1.7163061276848952, + "language_loss": 0.78412032, + "learning_rate": 1.5361017456113935e-06, + "loss": 0.81195652, + "num_input_tokens_seen": 105341610, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.42578125, + "step": 4884, + "time_per_iteration": 3.130305290222168 + }, + { + "auxiliary_loss_clip": 0.01511711, + "auxiliary_loss_mlp": 0.01265002, + "balance_loss_clip": 1.14733875, + "balance_loss_mlp": 1.02124262, + "epoch": 0.5873865207719594, + "flos": 18443889626400.0, + "grad_norm": 2.2239577388085507, + "language_loss": 0.85612369, + "learning_rate": 1.5353440533484085e-06, + "loss": 0.88389075, + "num_input_tokens_seen": 105360465, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 2.43359375, + "step": 4885, + "time_per_iteration": 3.089254379272461 + }, + { + "auxiliary_loss_clip": 0.01514095, + "auxiliary_loss_mlp": 0.01283791, + "balance_loss_clip": 1.14891768, + "balance_loss_mlp": 1.04041362, + "epoch": 0.5875067636625985, + "flos": 54019265935680.0, + "grad_norm": 1.9585188969331655, + "language_loss": 0.66244709, + "learning_rate": 1.534586431573361e-06, + "loss": 0.69042599, + "num_input_tokens_seen": 105385405, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 2.4296875, + "step": 4886, + "time_per_iteration": 3.2758870124816895 + }, + { + "auxiliary_loss_clip": 0.01514155, + "auxiliary_loss_mlp": 0.01269938, + "balance_loss_clip": 1.14932179, + "balance_loss_mlp": 1.02484357, + "epoch": 0.5876270065532375, + "flos": 27997898941440.0, + "grad_norm": 2.6121697172988116, + "language_loss": 0.7908895, + "learning_rate": 1.5338288804011817e-06, + "loss": 0.81873047, + "num_input_tokens_seen": 105404905, + "router_z_loss_clip": 3.65039062, + "router_z_loss_mlp": 2.44726562, + "step": 4887, + "time_per_iteration": 2.974247932434082 + }, + { + "auxiliary_loss_clip": 0.01508874, + "auxiliary_loss_mlp": 0.01269815, + "balance_loss_clip": 1.14416945, + "balance_loss_mlp": 1.02319527, + "epoch": 0.5877472494438767, + "flos": 21363628528800.0, + "grad_norm": 2.0411676796287628, + "language_loss": 0.71329582, + "learning_rate": 1.533071399946791e-06, + "loss": 0.74108279, + "num_input_tokens_seen": 105423650, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 2.46289062, + "step": 4888, + "time_per_iteration": 3.0051920413970947 + }, + { + "auxiliary_loss_clip": 0.0151342, + "auxiliary_loss_mlp": 0.01272014, + "balance_loss_clip": 1.15222669, + "balance_loss_mlp": 1.02844548, + "epoch": 0.5878674923345157, + "flos": 22385233719360.0, + "grad_norm": 2.073168356028948, + "language_loss": 0.57601917, + "learning_rate": 1.5323139903250977e-06, + "loss": 0.60387349, + "num_input_tokens_seen": 105444255, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 2.43164062, + "step": 4889, + "time_per_iteration": 3.131582498550415 + }, + { + "auxiliary_loss_clip": 0.01519419, + "auxiliary_loss_mlp": 0.01282061, + "balance_loss_clip": 1.15666604, + "balance_loss_mlp": 1.03505969, + "epoch": 0.5879877352251548, + "flos": 21870657272160.0, + "grad_norm": 1.6777117134489743, + "language_loss": 0.77037126, + "learning_rate": 1.5315566516510002e-06, + "loss": 0.7983861, + "num_input_tokens_seen": 105462425, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.46679688, + "step": 4890, + "time_per_iteration": 3.0056753158569336 + }, + { + "auxiliary_loss_clip": 0.01515142, + "auxiliary_loss_mlp": 0.01282503, + "balance_loss_clip": 1.15219069, + "balance_loss_mlp": 1.03512049, + "epoch": 0.5881079781157939, + "flos": 17495752010400.0, + "grad_norm": 1.8615174887396249, + "language_loss": 0.67477357, + "learning_rate": 1.5307993840393857e-06, + "loss": 0.70275003, + "num_input_tokens_seen": 105480505, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.47070312, + "step": 4891, + "time_per_iteration": 2.9894564151763916 + }, + { + "auxiliary_loss_clip": 0.01510475, + "auxiliary_loss_mlp": 0.01269235, + "balance_loss_clip": 1.14803338, + "balance_loss_mlp": 1.02700114, + "epoch": 0.588228221006433, + "flos": 22604081388480.0, + "grad_norm": 2.4532625090630544, + "language_loss": 0.80260539, + "learning_rate": 1.530042187605132e-06, + "loss": 0.83040243, + "num_input_tokens_seen": 105499760, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.41796875, + "step": 4892, + "time_per_iteration": 2.9625303745269775 + }, + { + "auxiliary_loss_clip": 0.0151851, + "auxiliary_loss_mlp": 0.01278373, + "balance_loss_clip": 1.15532911, + "balance_loss_mlp": 1.03537714, + "epoch": 0.5883484638970721, + "flos": 26179339237920.0, + "grad_norm": 1.870543854040571, + "language_loss": 0.8409543, + "learning_rate": 1.5292850624631044e-06, + "loss": 0.86892319, + "num_input_tokens_seen": 105521955, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.42578125, + "step": 4893, + "time_per_iteration": 3.1243443489074707 + }, + { + "auxiliary_loss_clip": 0.01514451, + "auxiliary_loss_mlp": 0.01260494, + "balance_loss_clip": 1.15079546, + "balance_loss_mlp": 1.01711583, + "epoch": 0.5884687067877111, + "flos": 30446261935200.0, + "grad_norm": 2.0257505143983967, + "language_loss": 0.80640137, + "learning_rate": 1.5285280087281593e-06, + "loss": 0.83415079, + "num_input_tokens_seen": 105542685, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.4296875, + "step": 4894, + "time_per_iteration": 3.1865768432617188 + }, + { + "auxiliary_loss_clip": 0.0153841, + "auxiliary_loss_mlp": 0.01196556, + "balance_loss_clip": 1.18242121, + "balance_loss_mlp": 0.99971771, + "epoch": 0.5885889496783503, + "flos": 70514500812480.0, + "grad_norm": 0.6520634339761556, + "language_loss": 0.56557679, + "learning_rate": 1.5277710265151398e-06, + "loss": 0.59292638, + "num_input_tokens_seen": 105612165, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 1.96875, + "step": 4895, + "time_per_iteration": 3.6265029907226562 + }, + { + "auxiliary_loss_clip": 0.0151938, + "auxiliary_loss_mlp": 0.01276547, + "balance_loss_clip": 1.15720427, + "balance_loss_mlp": 1.03164339, + "epoch": 0.5887091925689893, + "flos": 19100773987200.0, + "grad_norm": 4.3660286370082355, + "language_loss": 0.77483147, + "learning_rate": 1.5270141159388803e-06, + "loss": 0.80279076, + "num_input_tokens_seen": 105629185, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 2.4453125, + "step": 4896, + "time_per_iteration": 3.0199568271636963 + }, + { + "auxiliary_loss_clip": 0.01506813, + "auxiliary_loss_mlp": 0.01276905, + "balance_loss_clip": 1.14282048, + "balance_loss_mlp": 1.03295541, + "epoch": 0.5888294354596284, + "flos": 23296504799520.0, + "grad_norm": 1.9478883329446905, + "language_loss": 0.80462146, + "learning_rate": 1.526257277114203e-06, + "loss": 0.83245873, + "num_input_tokens_seen": 105650260, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 2.43554688, + "step": 4897, + "time_per_iteration": 2.9852020740509033 + }, + { + "auxiliary_loss_clip": 0.01512231, + "auxiliary_loss_mlp": 0.01268899, + "balance_loss_clip": 1.14913678, + "balance_loss_mlp": 1.02628481, + "epoch": 0.5889496783502676, + "flos": 21983722210080.0, + "grad_norm": 1.8485378868295692, + "language_loss": 0.79916966, + "learning_rate": 1.5255005101559201e-06, + "loss": 0.82698101, + "num_input_tokens_seen": 105667870, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.421875, + "step": 4898, + "time_per_iteration": 3.9594321250915527 + }, + { + "auxiliary_loss_clip": 0.01513602, + "auxiliary_loss_mlp": 0.01268625, + "balance_loss_clip": 1.14996409, + "balance_loss_mlp": 1.02429318, + "epoch": 0.5890699212409066, + "flos": 21687159012480.0, + "grad_norm": 2.7480023755749303, + "language_loss": 0.77091837, + "learning_rate": 1.524743815178833e-06, + "loss": 0.79874063, + "num_input_tokens_seen": 105685830, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.43945312, + "step": 4899, + "time_per_iteration": 3.848947525024414 + }, + { + "auxiliary_loss_clip": 0.0151199, + "auxiliary_loss_mlp": 0.0126934, + "balance_loss_clip": 1.14804721, + "balance_loss_mlp": 1.02767849, + "epoch": 0.5891901641315457, + "flos": 19466632661760.0, + "grad_norm": 2.791648459528877, + "language_loss": 0.80499941, + "learning_rate": 1.5239871922977315e-06, + "loss": 0.83281267, + "num_input_tokens_seen": 105705745, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.41210938, + "step": 4900, + "time_per_iteration": 3.0497357845306396 + }, + { + "auxiliary_loss_clip": 0.01511045, + "auxiliary_loss_mlp": 0.01286331, + "balance_loss_clip": 1.14662981, + "balance_loss_mlp": 1.04066432, + "epoch": 0.5893104070221848, + "flos": 19611936900000.0, + "grad_norm": 2.4982206255465074, + "language_loss": 0.89576018, + "learning_rate": 1.523230641627394e-06, + "loss": 0.92373395, + "num_input_tokens_seen": 105724730, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 2.453125, + "step": 4901, + "time_per_iteration": 2.9976632595062256 + }, + { + "auxiliary_loss_clip": 0.01516491, + "auxiliary_loss_mlp": 0.0127988, + "balance_loss_clip": 1.15367436, + "balance_loss_mlp": 1.03650248, + "epoch": 0.5894306499128239, + "flos": 29062666742400.0, + "grad_norm": 3.0591711405829125, + "language_loss": 0.73125875, + "learning_rate": 1.5224741632825888e-06, + "loss": 0.75922239, + "num_input_tokens_seen": 105744920, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.4296875, + "step": 4902, + "time_per_iteration": 3.0722556114196777 + }, + { + "auxiliary_loss_clip": 0.01514713, + "auxiliary_loss_mlp": 0.01282016, + "balance_loss_clip": 1.14998722, + "balance_loss_mlp": 1.03711295, + "epoch": 0.589550892803463, + "flos": 42301926663840.0, + "grad_norm": 1.9132434120666928, + "language_loss": 0.69667804, + "learning_rate": 1.521717757378074e-06, + "loss": 0.72464532, + "num_input_tokens_seen": 105765465, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 2.4453125, + "step": 4903, + "time_per_iteration": 3.994880437850952 + }, + { + "auxiliary_loss_clip": 0.01513141, + "auxiliary_loss_mlp": 0.01277205, + "balance_loss_clip": 1.15088212, + "balance_loss_mlp": 1.02886808, + "epoch": 0.5896711356941021, + "flos": 14138810835840.0, + "grad_norm": 2.5632253254525525, + "language_loss": 0.69475675, + "learning_rate": 1.5209614240285943e-06, + "loss": 0.72266024, + "num_input_tokens_seen": 105783120, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 2.48046875, + "step": 4904, + "time_per_iteration": 3.126332998275757 + }, + { + "auxiliary_loss_clip": 0.01510387, + "auxiliary_loss_mlp": 0.0128428, + "balance_loss_clip": 1.14590311, + "balance_loss_mlp": 1.03956747, + "epoch": 0.5897913785847412, + "flos": 17203247125920.0, + "grad_norm": 4.118752369357894, + "language_loss": 0.84768975, + "learning_rate": 1.520205163348887e-06, + "loss": 0.8756364, + "num_input_tokens_seen": 105801055, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 2.44335938, + "step": 4905, + "time_per_iteration": 3.055690050125122 + }, + { + "auxiliary_loss_clip": 0.0153823, + "auxiliary_loss_mlp": 0.0121537, + "balance_loss_clip": 1.18162704, + "balance_loss_mlp": 1.01548004, + "epoch": 0.5899116214753802, + "flos": 48799653878880.0, + "grad_norm": 0.7448419914390185, + "language_loss": 0.56939054, + "learning_rate": 1.519448975453674e-06, + "loss": 0.59692651, + "num_input_tokens_seen": 105856155, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 1.9921875, + "step": 4906, + "time_per_iteration": 3.310044765472412 + }, + { + "auxiliary_loss_clip": 0.01517611, + "auxiliary_loss_mlp": 0.01281502, + "balance_loss_clip": 1.1545577, + "balance_loss_mlp": 1.03488207, + "epoch": 0.5900318643660194, + "flos": 21105676562400.0, + "grad_norm": 2.3957195953956503, + "language_loss": 0.76076281, + "learning_rate": 1.5186928604576696e-06, + "loss": 0.78875399, + "num_input_tokens_seen": 105873350, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.46289062, + "step": 4907, + "time_per_iteration": 3.0633721351623535 + }, + { + "auxiliary_loss_clip": 0.01515568, + "auxiliary_loss_mlp": 0.01286652, + "balance_loss_clip": 1.15183449, + "balance_loss_mlp": 1.04212987, + "epoch": 0.5901521072566585, + "flos": 21180509550720.0, + "grad_norm": 2.4961618412686875, + "language_loss": 0.77341497, + "learning_rate": 1.5179368184755752e-06, + "loss": 0.80143714, + "num_input_tokens_seen": 105891435, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.44140625, + "step": 4908, + "time_per_iteration": 3.814807415008545 + }, + { + "auxiliary_loss_clip": 0.01513881, + "auxiliary_loss_mlp": 0.01268039, + "balance_loss_clip": 1.15159774, + "balance_loss_mlp": 1.02428007, + "epoch": 0.5902723501472975, + "flos": 20227820555520.0, + "grad_norm": 1.6436982000028904, + "language_loss": 0.82856047, + "learning_rate": 1.5171808496220821e-06, + "loss": 0.85637963, + "num_input_tokens_seen": 105910190, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 2.43359375, + "step": 4909, + "time_per_iteration": 2.9919748306274414 + }, + { + "auxiliary_loss_clip": 0.01513169, + "auxiliary_loss_mlp": 0.01272666, + "balance_loss_clip": 1.14940226, + "balance_loss_mlp": 1.02585483, + "epoch": 0.5903925930379367, + "flos": 22966526528640.0, + "grad_norm": 2.0273738936307946, + "language_loss": 0.81589556, + "learning_rate": 1.5164249540118708e-06, + "loss": 0.84375393, + "num_input_tokens_seen": 105929315, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 2.46484375, + "step": 4910, + "time_per_iteration": 3.020756721496582 + }, + { + "auxiliary_loss_clip": 0.01511106, + "auxiliary_loss_mlp": 0.01285586, + "balance_loss_clip": 1.14819801, + "balance_loss_mlp": 1.04125476, + "epoch": 0.5905128359285757, + "flos": 23369972374080.0, + "grad_norm": 1.9268120420960104, + "language_loss": 0.83256781, + "learning_rate": 1.5156691317596093e-06, + "loss": 0.86053473, + "num_input_tokens_seen": 105950740, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.43945312, + "step": 4911, + "time_per_iteration": 3.0355770587921143 + }, + { + "auxiliary_loss_clip": 0.01516621, + "auxiliary_loss_mlp": 0.01275238, + "balance_loss_clip": 1.15266502, + "balance_loss_mlp": 1.03262329, + "epoch": 0.5906330788192148, + "flos": 28034537908320.0, + "grad_norm": 2.19782766093713, + "language_loss": 0.6670056, + "learning_rate": 1.5149133829799556e-06, + "loss": 0.69492418, + "num_input_tokens_seen": 105968735, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.421875, + "step": 4912, + "time_per_iteration": 3.069413900375366 + }, + { + "auxiliary_loss_clip": 0.01515171, + "auxiliary_loss_mlp": 0.01281568, + "balance_loss_clip": 1.15130675, + "balance_loss_mlp": 1.03513861, + "epoch": 0.590753321709854, + "flos": 18479883814560.0, + "grad_norm": 1.9799618283068348, + "language_loss": 0.80951983, + "learning_rate": 1.5141577077875556e-06, + "loss": 0.83748722, + "num_input_tokens_seen": 105986060, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.4609375, + "step": 4913, + "time_per_iteration": 3.080842971801758 + }, + { + "auxiliary_loss_clip": 0.01515915, + "auxiliary_loss_mlp": 0.01275325, + "balance_loss_clip": 1.15165043, + "balance_loss_mlp": 1.031757, + "epoch": 0.590873564600493, + "flos": 16875620400960.0, + "grad_norm": 1.9687779351993662, + "language_loss": 0.72592294, + "learning_rate": 1.5134021062970451e-06, + "loss": 0.75383532, + "num_input_tokens_seen": 106004440, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 2.43164062, + "step": 4914, + "time_per_iteration": 3.0138206481933594 + }, + { + "auxiliary_loss_clip": 0.01518593, + "auxiliary_loss_mlp": 0.01267564, + "balance_loss_clip": 1.15528524, + "balance_loss_mlp": 1.02418602, + "epoch": 0.5909938074911321, + "flos": 13517844806880.0, + "grad_norm": 1.9912647249290363, + "language_loss": 0.8123073, + "learning_rate": 1.5126465786230483e-06, + "loss": 0.84016889, + "num_input_tokens_seen": 106021215, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 2.4296875, + "step": 4915, + "time_per_iteration": 3.102245807647705 + }, + { + "auxiliary_loss_clip": 0.01515554, + "auxiliary_loss_mlp": 0.01280225, + "balance_loss_clip": 1.15079451, + "balance_loss_mlp": 1.03455889, + "epoch": 0.5911140503817712, + "flos": 26026070086080.0, + "grad_norm": 2.618583882658044, + "language_loss": 0.82021034, + "learning_rate": 1.5118911248801787e-06, + "loss": 0.84816813, + "num_input_tokens_seen": 106039225, + "router_z_loss_clip": 3.65039062, + "router_z_loss_mlp": 2.453125, + "step": 4916, + "time_per_iteration": 3.199063539505005 + }, + { + "auxiliary_loss_clip": 0.01508046, + "auxiliary_loss_mlp": 0.01275288, + "balance_loss_clip": 1.14436781, + "balance_loss_mlp": 1.02981257, + "epoch": 0.5912342932724103, + "flos": 23261003677440.0, + "grad_norm": 4.144163019748825, + "language_loss": 0.79845083, + "learning_rate": 1.5111357451830364e-06, + "loss": 0.82628417, + "num_input_tokens_seen": 106057920, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 2.45117188, + "step": 4917, + "time_per_iteration": 3.008730411529541 + }, + { + "auxiliary_loss_clip": 0.01514302, + "auxiliary_loss_mlp": 0.01277488, + "balance_loss_clip": 1.15046775, + "balance_loss_mlp": 1.03430104, + "epoch": 0.5913545361630493, + "flos": 19575070364160.0, + "grad_norm": 2.2149326319156315, + "language_loss": 0.71008438, + "learning_rate": 1.5103804396462131e-06, + "loss": 0.7380023, + "num_input_tokens_seen": 106077855, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 2.42773438, + "step": 4918, + "time_per_iteration": 3.0287721157073975 + }, + { + "auxiliary_loss_clip": 0.01511369, + "auxiliary_loss_mlp": 0.0127875, + "balance_loss_clip": 1.1460222, + "balance_loss_mlp": 1.03098583, + "epoch": 0.5914747790536885, + "flos": 26215940276640.0, + "grad_norm": 1.9174209888112173, + "language_loss": 0.80005395, + "learning_rate": 1.5096252083842877e-06, + "loss": 0.82795513, + "num_input_tokens_seen": 106097065, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 2.47460938, + "step": 4919, + "time_per_iteration": 3.0470800399780273 + }, + { + "auxiliary_loss_clip": 0.01514649, + "auxiliary_loss_mlp": 0.01272743, + "balance_loss_clip": 1.1494534, + "balance_loss_mlp": 1.02650452, + "epoch": 0.5915950219443276, + "flos": 27420019666560.0, + "grad_norm": 1.9535398259408965, + "language_loss": 0.85983384, + "learning_rate": 1.5088700515118285e-06, + "loss": 0.88770777, + "num_input_tokens_seen": 106116385, + "router_z_loss_clip": 3.65429688, + "router_z_loss_mlp": 2.45898438, + "step": 4920, + "time_per_iteration": 3.089351177215576 + }, + { + "auxiliary_loss_clip": 0.01517665, + "auxiliary_loss_mlp": 0.01271306, + "balance_loss_clip": 1.15418839, + "balance_loss_mlp": 1.02430415, + "epoch": 0.5917152648349666, + "flos": 21910027066560.0, + "grad_norm": 1.6303537353913342, + "language_loss": 0.66720456, + "learning_rate": 1.508114969143392e-06, + "loss": 0.69509423, + "num_input_tokens_seen": 106136370, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.46679688, + "step": 4921, + "time_per_iteration": 3.087496042251587 + }, + { + "auxiliary_loss_clip": 0.01508359, + "auxiliary_loss_mlp": 0.01276394, + "balance_loss_clip": 1.14370203, + "balance_loss_mlp": 1.03244448, + "epoch": 0.5918355077256057, + "flos": 28111608658080.0, + "grad_norm": 1.7182417474236849, + "language_loss": 0.77629602, + "learning_rate": 1.5073599613935238e-06, + "loss": 0.80414349, + "num_input_tokens_seen": 106158490, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 2.43554688, + "step": 4922, + "time_per_iteration": 3.2087926864624023 + }, + { + "auxiliary_loss_clip": 0.01514468, + "auxiliary_loss_mlp": 0.01270628, + "balance_loss_clip": 1.1498481, + "balance_loss_mlp": 1.02610588, + "epoch": 0.5919557506162448, + "flos": 28186327861920.0, + "grad_norm": 2.101432663843386, + "language_loss": 0.57898033, + "learning_rate": 1.5066050283767574e-06, + "loss": 0.60683131, + "num_input_tokens_seen": 106179170, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 2.44140625, + "step": 4923, + "time_per_iteration": 3.197124481201172 + }, + { + "auxiliary_loss_clip": 0.01512985, + "auxiliary_loss_mlp": 0.0128047, + "balance_loss_clip": 1.14959466, + "balance_loss_mlp": 1.03861856, + "epoch": 0.5920759935068839, + "flos": 12096017664480.0, + "grad_norm": 3.7721747679158684, + "language_loss": 0.83012283, + "learning_rate": 1.505850170207616e-06, + "loss": 0.85805738, + "num_input_tokens_seen": 106196035, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.4140625, + "step": 4924, + "time_per_iteration": 3.0824952125549316 + }, + { + "auxiliary_loss_clip": 0.01509747, + "auxiliary_loss_mlp": 0.01274251, + "balance_loss_clip": 1.14646411, + "balance_loss_mlp": 1.02953839, + "epoch": 0.592196236397523, + "flos": 29427387572160.0, + "grad_norm": 2.4190783288184723, + "language_loss": 0.77738512, + "learning_rate": 1.505095387000611e-06, + "loss": 0.80522513, + "num_input_tokens_seen": 106218335, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.44335938, + "step": 4925, + "time_per_iteration": 3.1049857139587402 + }, + { + "auxiliary_loss_clip": 0.01514479, + "auxiliary_loss_mlp": 0.01268039, + "balance_loss_clip": 1.15119934, + "balance_loss_mlp": 1.02370763, + "epoch": 0.5923164792881621, + "flos": 24386760688320.0, + "grad_norm": 5.031851553751866, + "language_loss": 0.74031943, + "learning_rate": 1.504340678870242e-06, + "loss": 0.76814461, + "num_input_tokens_seen": 106236550, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.43945312, + "step": 4926, + "time_per_iteration": 4.02046275138855 + }, + { + "auxiliary_loss_clip": 0.01510924, + "auxiliary_loss_mlp": 0.01264928, + "balance_loss_clip": 1.14825177, + "balance_loss_mlp": 1.02040553, + "epoch": 0.5924367221788012, + "flos": 24026515381440.0, + "grad_norm": 2.4667263806755453, + "language_loss": 0.89746535, + "learning_rate": 1.5035860459309989e-06, + "loss": 0.92522389, + "num_input_tokens_seen": 106254265, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.44140625, + "step": 4927, + "time_per_iteration": 3.930455446243286 + }, + { + "auxiliary_loss_clip": 0.01516656, + "auxiliary_loss_mlp": 0.01271241, + "balance_loss_clip": 1.15291786, + "balance_loss_mlp": 1.02404857, + "epoch": 0.5925569650694402, + "flos": 26873052206400.0, + "grad_norm": 2.5916940190385054, + "language_loss": 0.63892722, + "learning_rate": 1.5028314882973568e-06, + "loss": 0.66680622, + "num_input_tokens_seen": 106274670, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 2.46875, + "step": 4928, + "time_per_iteration": 3.060356855392456 + }, + { + "auxiliary_loss_clip": 0.0151121, + "auxiliary_loss_mlp": 0.01278449, + "balance_loss_clip": 1.14833736, + "balance_loss_mlp": 1.03125691, + "epoch": 0.5926772079600794, + "flos": 22304597722560.0, + "grad_norm": 1.879851730808625, + "language_loss": 0.8409155, + "learning_rate": 1.502077006083783e-06, + "loss": 0.86881208, + "num_input_tokens_seen": 106293330, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.46875, + "step": 4929, + "time_per_iteration": 2.968636989593506 + }, + { + "auxiliary_loss_clip": 0.01511457, + "auxiliary_loss_mlp": 0.01266112, + "balance_loss_clip": 1.14796066, + "balance_loss_mlp": 1.02330637, + "epoch": 0.5927974508507184, + "flos": 19867309751520.0, + "grad_norm": 2.329039959861209, + "language_loss": 0.76892608, + "learning_rate": 1.5013225994047315e-06, + "loss": 0.79670179, + "num_input_tokens_seen": 106310960, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 2.42382812, + "step": 4930, + "time_per_iteration": 4.02327823638916 + }, + { + "auxiliary_loss_clip": 0.01512255, + "auxiliary_loss_mlp": 0.01268773, + "balance_loss_clip": 1.14866114, + "balance_loss_mlp": 1.02444148, + "epoch": 0.5929176937413575, + "flos": 15778461587040.0, + "grad_norm": 1.7899974844897213, + "language_loss": 0.80775118, + "learning_rate": 1.5005682683746452e-06, + "loss": 0.83556145, + "num_input_tokens_seen": 106329475, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 2.43945312, + "step": 4931, + "time_per_iteration": 3.1544911861419678 + }, + { + "auxiliary_loss_clip": 0.01516144, + "auxiliary_loss_mlp": 0.0127969, + "balance_loss_clip": 1.15226579, + "balance_loss_mlp": 1.03497767, + "epoch": 0.5930379366319967, + "flos": 17603317365120.0, + "grad_norm": 2.3463693154458873, + "language_loss": 0.72829938, + "learning_rate": 1.4998140131079553e-06, + "loss": 0.75625771, + "num_input_tokens_seen": 106345565, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.44335938, + "step": 4932, + "time_per_iteration": 3.049086332321167 + }, + { + "auxiliary_loss_clip": 0.01511522, + "auxiliary_loss_mlp": 0.01275571, + "balance_loss_clip": 1.14731419, + "balance_loss_mlp": 1.03104937, + "epoch": 0.5931581795226357, + "flos": 17705762418240.0, + "grad_norm": 1.8985947959600906, + "language_loss": 0.73611176, + "learning_rate": 1.4990598337190821e-06, + "loss": 0.76398265, + "num_input_tokens_seen": 106361920, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 2.44140625, + "step": 4933, + "time_per_iteration": 3.050868034362793 + }, + { + "auxiliary_loss_clip": 0.01513628, + "auxiliary_loss_mlp": 0.01273244, + "balance_loss_clip": 1.15069962, + "balance_loss_mlp": 1.02872169, + "epoch": 0.5932784224132748, + "flos": 24282419227200.0, + "grad_norm": 1.8692068206697758, + "language_loss": 0.68090844, + "learning_rate": 1.4983057303224338e-06, + "loss": 0.70877713, + "num_input_tokens_seen": 106381735, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.44140625, + "step": 4934, + "time_per_iteration": 3.1294937133789062 + }, + { + "auxiliary_loss_clip": 0.01517927, + "auxiliary_loss_mlp": 0.01281658, + "balance_loss_clip": 1.15287638, + "balance_loss_mlp": 1.03942454, + "epoch": 0.5933986653039139, + "flos": 22928749716960.0, + "grad_norm": 2.1794849565893437, + "language_loss": 0.8739686, + "learning_rate": 1.4975517030324072e-06, + "loss": 0.90196443, + "num_input_tokens_seen": 106399745, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 2.41796875, + "step": 4935, + "time_per_iteration": 3.8976199626922607 + }, + { + "auxiliary_loss_clip": 0.01523449, + "auxiliary_loss_mlp": 0.01204445, + "balance_loss_clip": 1.16643596, + "balance_loss_mlp": 1.00760651, + "epoch": 0.593518908194553, + "flos": 71128374275520.0, + "grad_norm": 0.7884199685324277, + "language_loss": 0.61729014, + "learning_rate": 1.4967977519633882e-06, + "loss": 0.6445691, + "num_input_tokens_seen": 106457205, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 1.96484375, + "step": 4936, + "time_per_iteration": 3.645092010498047 + }, + { + "auxiliary_loss_clip": 0.01509563, + "auxiliary_loss_mlp": 0.01272891, + "balance_loss_clip": 1.14621139, + "balance_loss_mlp": 1.0281781, + "epoch": 0.593639151085192, + "flos": 20450726537760.0, + "grad_norm": 2.089307423812397, + "language_loss": 0.78327817, + "learning_rate": 1.4960438772297494e-06, + "loss": 0.81110269, + "num_input_tokens_seen": 106474250, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 2.44335938, + "step": 4937, + "time_per_iteration": 3.1424930095672607 + }, + { + "auxiliary_loss_clip": 0.01506962, + "auxiliary_loss_mlp": 0.01271026, + "balance_loss_clip": 1.14323568, + "balance_loss_mlp": 1.02478695, + "epoch": 0.5937593939758312, + "flos": 30886081250400.0, + "grad_norm": 2.17856836531395, + "language_loss": 0.73293364, + "learning_rate": 1.495290078945855e-06, + "loss": 0.76071346, + "num_input_tokens_seen": 106494015, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 2.45898438, + "step": 4938, + "time_per_iteration": 3.161539077758789 + }, + { + "auxiliary_loss_clip": 0.01510978, + "auxiliary_loss_mlp": 0.01276108, + "balance_loss_clip": 1.14690351, + "balance_loss_mlp": 1.0340656, + "epoch": 0.5938796368664703, + "flos": 36900751047840.0, + "grad_norm": 2.0545613375775806, + "language_loss": 0.7428242, + "learning_rate": 1.4945363572260529e-06, + "loss": 0.77069503, + "num_input_tokens_seen": 106515010, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 2.41601562, + "step": 4939, + "time_per_iteration": 3.1585354804992676 + }, + { + "auxiliary_loss_clip": 0.0151543, + "auxiliary_loss_mlp": 0.01262991, + "balance_loss_clip": 1.15254331, + "balance_loss_mlp": 1.01885033, + "epoch": 0.5939998797571093, + "flos": 23845254883200.0, + "grad_norm": 3.682305031967398, + "language_loss": 0.68210423, + "learning_rate": 1.4937827121846845e-06, + "loss": 0.7098884, + "num_input_tokens_seen": 106535265, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.4375, + "step": 4940, + "time_per_iteration": 3.0738463401794434 + }, + { + "auxiliary_loss_clip": 0.01516957, + "auxiliary_loss_mlp": 0.01272167, + "balance_loss_clip": 1.15260363, + "balance_loss_mlp": 1.03126836, + "epoch": 0.5941201226477485, + "flos": 25193728235520.0, + "grad_norm": 2.575637930417707, + "language_loss": 0.73057783, + "learning_rate": 1.4930291439360755e-06, + "loss": 0.7584691, + "num_input_tokens_seen": 106557830, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 2.40429688, + "step": 4941, + "time_per_iteration": 3.106577157974243 + }, + { + "auxiliary_loss_clip": 0.01517388, + "auxiliary_loss_mlp": 0.01266748, + "balance_loss_clip": 1.15366864, + "balance_loss_mlp": 1.02241671, + "epoch": 0.5942403655383875, + "flos": 22421189979360.0, + "grad_norm": 2.602732259453385, + "language_loss": 0.79035532, + "learning_rate": 1.4922756525945427e-06, + "loss": 0.81819665, + "num_input_tokens_seen": 106577140, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.43945312, + "step": 4942, + "time_per_iteration": 3.139582395553589 + }, + { + "auxiliary_loss_clip": 0.01519071, + "auxiliary_loss_mlp": 0.01203461, + "balance_loss_clip": 1.16217804, + "balance_loss_mlp": 1.00585938, + "epoch": 0.5943606084290266, + "flos": 67636331537760.0, + "grad_norm": 0.7746094717168867, + "language_loss": 0.59530139, + "learning_rate": 1.4915222382743894e-06, + "loss": 0.62252671, + "num_input_tokens_seen": 106635975, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 1.96875, + "step": 4943, + "time_per_iteration": 3.4995360374450684 + }, + { + "auxiliary_loss_clip": 0.01514557, + "auxiliary_loss_mlp": 0.01276705, + "balance_loss_clip": 1.15100551, + "balance_loss_mlp": 1.03142023, + "epoch": 0.5944808513196658, + "flos": 18225117813600.0, + "grad_norm": 2.379040553539185, + "language_loss": 0.7169494, + "learning_rate": 1.4907689010899085e-06, + "loss": 0.74486196, + "num_input_tokens_seen": 106653555, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 2.44921875, + "step": 4944, + "time_per_iteration": 3.0542120933532715 + }, + { + "auxiliary_loss_clip": 0.01510358, + "auxiliary_loss_mlp": 0.01273326, + "balance_loss_clip": 1.1466428, + "balance_loss_mlp": 1.02956736, + "epoch": 0.5946010942103048, + "flos": 24793164930240.0, + "grad_norm": 2.787740635152034, + "language_loss": 0.6257531, + "learning_rate": 1.4900156411553804e-06, + "loss": 0.65358996, + "num_input_tokens_seen": 106673385, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 2.43359375, + "step": 4945, + "time_per_iteration": 3.038616180419922 + }, + { + "auxiliary_loss_clip": 0.01512516, + "auxiliary_loss_mlp": 0.012754, + "balance_loss_clip": 1.14955997, + "balance_loss_mlp": 1.03392982, + "epoch": 0.5947213371009439, + "flos": 15233428463040.0, + "grad_norm": 2.503147946760867, + "language_loss": 0.85543871, + "learning_rate": 1.4892624585850739e-06, + "loss": 0.88331789, + "num_input_tokens_seen": 106691740, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.41015625, + "step": 4946, + "time_per_iteration": 3.011084794998169 + }, + { + "auxiliary_loss_clip": 0.01512776, + "auxiliary_loss_mlp": 0.01275158, + "balance_loss_clip": 1.15002871, + "balance_loss_mlp": 1.02853739, + "epoch": 0.594841579991583, + "flos": 25850422955520.0, + "grad_norm": 3.059958808825439, + "language_loss": 0.79437864, + "learning_rate": 1.4885093534932465e-06, + "loss": 0.822258, + "num_input_tokens_seen": 106709705, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.46289062, + "step": 4947, + "time_per_iteration": 3.02898907661438 + }, + { + "auxiliary_loss_clip": 0.01516384, + "auxiliary_loss_mlp": 0.01280184, + "balance_loss_clip": 1.15366614, + "balance_loss_mlp": 1.0356617, + "epoch": 0.5949618228822221, + "flos": 23983011417600.0, + "grad_norm": 2.2057777667485716, + "language_loss": 0.71165931, + "learning_rate": 1.4877563259941433e-06, + "loss": 0.73962498, + "num_input_tokens_seen": 106727560, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.44140625, + "step": 4948, + "time_per_iteration": 3.164078712463379 + }, + { + "auxiliary_loss_clip": 0.01518385, + "auxiliary_loss_mlp": 0.01272409, + "balance_loss_clip": 1.15683949, + "balance_loss_mlp": 1.02502561, + "epoch": 0.5950820657728612, + "flos": 40550424675840.0, + "grad_norm": 2.174791405963103, + "language_loss": 0.68010116, + "learning_rate": 1.4870033762019988e-06, + "loss": 0.70800912, + "num_input_tokens_seen": 106747725, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 2.47070312, + "step": 4949, + "time_per_iteration": 3.206328868865967 + }, + { + "auxiliary_loss_clip": 0.01517561, + "auxiliary_loss_mlp": 0.01283185, + "balance_loss_clip": 1.15588677, + "balance_loss_mlp": 1.03637385, + "epoch": 0.5952023086635003, + "flos": 23186663755200.0, + "grad_norm": 1.9268680237401021, + "language_loss": 0.73215199, + "learning_rate": 1.4862505042310334e-06, + "loss": 0.76015943, + "num_input_tokens_seen": 106767010, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.46484375, + "step": 4950, + "time_per_iteration": 3.1486239433288574 + }, + { + "auxiliary_loss_clip": 0.01516072, + "auxiliary_loss_mlp": 0.01273645, + "balance_loss_clip": 1.15345812, + "balance_loss_mlp": 1.0310303, + "epoch": 0.5953225515541394, + "flos": 33655926607200.0, + "grad_norm": 1.8097008352310813, + "language_loss": 0.70160538, + "learning_rate": 1.4854977101954587e-06, + "loss": 0.72950256, + "num_input_tokens_seen": 106789230, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.421875, + "step": 4951, + "time_per_iteration": 3.1063425540924072 + }, + { + "auxiliary_loss_clip": 0.01512741, + "auxiliary_loss_mlp": 0.01271809, + "balance_loss_clip": 1.15120173, + "balance_loss_mlp": 1.02976656, + "epoch": 0.5954427944447784, + "flos": 24461783317440.0, + "grad_norm": 2.293518497512293, + "language_loss": 0.86287802, + "learning_rate": 1.4847449942094716e-06, + "loss": 0.89072353, + "num_input_tokens_seen": 106808110, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 2.41601562, + "step": 4952, + "time_per_iteration": 3.215226650238037 + }, + { + "auxiliary_loss_clip": 0.01514175, + "auxiliary_loss_mlp": 0.01273931, + "balance_loss_clip": 1.15176082, + "balance_loss_mlp": 1.03226995, + "epoch": 0.5955630373354175, + "flos": 18553844455200.0, + "grad_norm": 2.1969477699435145, + "language_loss": 0.86947155, + "learning_rate": 1.4839923563872598e-06, + "loss": 0.89735258, + "num_input_tokens_seen": 106826650, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.41210938, + "step": 4953, + "time_per_iteration": 3.2233309745788574 + }, + { + "auxiliary_loss_clip": 0.0151361, + "auxiliary_loss_mlp": 0.01272362, + "balance_loss_clip": 1.15080714, + "balance_loss_mlp": 1.03050995, + "epoch": 0.5956832802260567, + "flos": 19793728392480.0, + "grad_norm": 2.2646204255907345, + "language_loss": 0.75896776, + "learning_rate": 1.483239796842997e-06, + "loss": 0.7868275, + "num_input_tokens_seen": 106844680, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.4140625, + "step": 4954, + "time_per_iteration": 4.676898956298828 + }, + { + "auxiliary_loss_clip": 0.01514055, + "auxiliary_loss_mlp": 0.01262285, + "balance_loss_clip": 1.1515137, + "balance_loss_mlp": 1.02062416, + "epoch": 0.5958035231166957, + "flos": 19752651830880.0, + "grad_norm": 2.4315185767212446, + "language_loss": 0.83725101, + "learning_rate": 1.4824873156908462e-06, + "loss": 0.86501443, + "num_input_tokens_seen": 106862605, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.41210938, + "step": 4955, + "time_per_iteration": 3.013746976852417 + }, + { + "auxiliary_loss_clip": 0.01513664, + "auxiliary_loss_mlp": 0.0127443, + "balance_loss_clip": 1.15132618, + "balance_loss_mlp": 1.0293355, + "epoch": 0.5959237660073348, + "flos": 21654957640320.0, + "grad_norm": 1.763083915498044, + "language_loss": 0.75754762, + "learning_rate": 1.4817349130449584e-06, + "loss": 0.78542858, + "num_input_tokens_seen": 106882325, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.44726562, + "step": 4956, + "time_per_iteration": 3.0472347736358643 + }, + { + "auxiliary_loss_clip": 0.01510535, + "auxiliary_loss_mlp": 0.01266269, + "balance_loss_clip": 1.14734387, + "balance_loss_mlp": 1.02365458, + "epoch": 0.5960440088979739, + "flos": 21172923918720.0, + "grad_norm": 4.029633825032379, + "language_loss": 0.83287156, + "learning_rate": 1.4809825890194717e-06, + "loss": 0.86063957, + "num_input_tokens_seen": 106900995, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.421875, + "step": 4957, + "time_per_iteration": 3.9073078632354736 + }, + { + "auxiliary_loss_clip": 0.01517019, + "auxiliary_loss_mlp": 0.01268314, + "balance_loss_clip": 1.15404344, + "balance_loss_mlp": 1.02665257, + "epoch": 0.596164251788613, + "flos": 14759321726880.0, + "grad_norm": 1.7486521732831999, + "language_loss": 0.77040559, + "learning_rate": 1.4802303437285139e-06, + "loss": 0.79825896, + "num_input_tokens_seen": 106918265, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.41210938, + "step": 4958, + "time_per_iteration": 3.1038522720336914 + }, + { + "auxiliary_loss_clip": 0.01509256, + "auxiliary_loss_mlp": 0.0127228, + "balance_loss_clip": 1.14697313, + "balance_loss_mlp": 1.02909327, + "epoch": 0.596284494679252, + "flos": 20488048211520.0, + "grad_norm": 2.275596815948805, + "language_loss": 0.81133044, + "learning_rate": 1.4794781772861994e-06, + "loss": 0.83914578, + "num_input_tokens_seen": 106934760, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.42773438, + "step": 4959, + "time_per_iteration": 3.122623920440674 + }, + { + "auxiliary_loss_clip": 0.01513275, + "auxiliary_loss_mlp": 0.01266058, + "balance_loss_clip": 1.14904583, + "balance_loss_mlp": 1.02306223, + "epoch": 0.5964047375698912, + "flos": 31214959604640.0, + "grad_norm": 2.3430060394857977, + "language_loss": 0.66659451, + "learning_rate": 1.4787260898066324e-06, + "loss": 0.69438791, + "num_input_tokens_seen": 106954760, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.42578125, + "step": 4960, + "time_per_iteration": 3.264589548110962 + }, + { + "auxiliary_loss_clip": 0.01514284, + "auxiliary_loss_mlp": 0.01275887, + "balance_loss_clip": 1.15177, + "balance_loss_mlp": 1.03460753, + "epoch": 0.5965249804605303, + "flos": 27485711968320.0, + "grad_norm": 2.034047204979449, + "language_loss": 0.85820872, + "learning_rate": 1.4779740814039023e-06, + "loss": 0.88611048, + "num_input_tokens_seen": 106974845, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.40820312, + "step": 4961, + "time_per_iteration": 3.098872184753418 + }, + { + "auxiliary_loss_clip": 0.01511072, + "auxiliary_loss_mlp": 0.01265537, + "balance_loss_clip": 1.1482451, + "balance_loss_mlp": 1.02177811, + "epoch": 0.5966452233511693, + "flos": 30776581559520.0, + "grad_norm": 2.4302298791357586, + "language_loss": 0.68920624, + "learning_rate": 1.4772221521920894e-06, + "loss": 0.71697235, + "num_input_tokens_seen": 106994870, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.43359375, + "step": 4962, + "time_per_iteration": 3.905459403991699 + }, + { + "auxiliary_loss_clip": 0.01514051, + "auxiliary_loss_mlp": 0.01280325, + "balance_loss_clip": 1.15240014, + "balance_loss_mlp": 1.03809166, + "epoch": 0.5967654662418085, + "flos": 25483388508000.0, + "grad_norm": 2.0282233861205556, + "language_loss": 0.74284267, + "learning_rate": 1.4764703022852598e-06, + "loss": 0.77078646, + "num_input_tokens_seen": 107015390, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 2.41796875, + "step": 4963, + "time_per_iteration": 3.140653610229492 + }, + { + "auxiliary_loss_clip": 0.01510033, + "auxiliary_loss_mlp": 0.01259354, + "balance_loss_clip": 1.14831543, + "balance_loss_mlp": 1.01883745, + "epoch": 0.5968857091324475, + "flos": 19101153268800.0, + "grad_norm": 2.0376125946970425, + "language_loss": 0.76998281, + "learning_rate": 1.4757185317974696e-06, + "loss": 0.79767668, + "num_input_tokens_seen": 107033775, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.40039062, + "step": 4964, + "time_per_iteration": 3.039346933364868 + }, + { + "auxiliary_loss_clip": 0.01503745, + "auxiliary_loss_mlp": 0.01266609, + "balance_loss_clip": 1.14185905, + "balance_loss_mlp": 1.02037024, + "epoch": 0.5970059520230866, + "flos": 23694792415200.0, + "grad_norm": 2.7765694567932293, + "language_loss": 0.710334, + "learning_rate": 1.474966840842761e-06, + "loss": 0.73803753, + "num_input_tokens_seen": 107053355, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 2.45898438, + "step": 4965, + "time_per_iteration": 3.065190315246582 + }, + { + "auxiliary_loss_clip": 0.01513184, + "auxiliary_loss_mlp": 0.01276355, + "balance_loss_clip": 1.1521579, + "balance_loss_mlp": 1.03488469, + "epoch": 0.5971261949137258, + "flos": 23187801600000.0, + "grad_norm": 1.8268591776512708, + "language_loss": 0.87028182, + "learning_rate": 1.4742152295351655e-06, + "loss": 0.89817715, + "num_input_tokens_seen": 107072510, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 2.41015625, + "step": 4966, + "time_per_iteration": 3.048811674118042 + }, + { + "auxiliary_loss_clip": 0.01504117, + "auxiliary_loss_mlp": 0.01277019, + "balance_loss_clip": 1.14131498, + "balance_loss_mlp": 1.03211594, + "epoch": 0.5972464378043648, + "flos": 20559771090720.0, + "grad_norm": 2.539954970542716, + "language_loss": 0.63629735, + "learning_rate": 1.4734636979887016e-06, + "loss": 0.66410875, + "num_input_tokens_seen": 107089970, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.4453125, + "step": 4967, + "time_per_iteration": 3.155156135559082 + }, + { + "auxiliary_loss_clip": 0.0150781, + "auxiliary_loss_mlp": 0.01279737, + "balance_loss_clip": 1.14419878, + "balance_loss_mlp": 1.03368914, + "epoch": 0.5973666806950039, + "flos": 29389572832320.0, + "grad_norm": 28.402266630967464, + "language_loss": 0.90384281, + "learning_rate": 1.4727122463173755e-06, + "loss": 0.93171823, + "num_input_tokens_seen": 107108500, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 2.45703125, + "step": 4968, + "time_per_iteration": 3.1362721920013428 + }, + { + "auxiliary_loss_clip": 0.01509574, + "auxiliary_loss_mlp": 0.01267342, + "balance_loss_clip": 1.14785016, + "balance_loss_mlp": 1.02301097, + "epoch": 0.597486923585643, + "flos": 22275734028480.0, + "grad_norm": 1.8007311435363138, + "language_loss": 0.64413589, + "learning_rate": 1.471960874635183e-06, + "loss": 0.67190504, + "num_input_tokens_seen": 107128060, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 2.43945312, + "step": 4969, + "time_per_iteration": 2.9923627376556396 + }, + { + "auxiliary_loss_clip": 0.01508239, + "auxiliary_loss_mlp": 0.01279364, + "balance_loss_clip": 1.14563322, + "balance_loss_mlp": 1.03217161, + "epoch": 0.5976071664762821, + "flos": 13774203790560.0, + "grad_norm": 2.542768059754904, + "language_loss": 0.7120961, + "learning_rate": 1.4712095830561055e-06, + "loss": 0.73997211, + "num_input_tokens_seen": 107146550, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.46875, + "step": 4970, + "time_per_iteration": 3.0663466453552246 + }, + { + "auxiliary_loss_clip": 0.01506516, + "auxiliary_loss_mlp": 0.01265276, + "balance_loss_clip": 1.14455247, + "balance_loss_mlp": 1.02361488, + "epoch": 0.5977274093669211, + "flos": 19100622274560.0, + "grad_norm": 2.480133813790263, + "language_loss": 0.80650723, + "learning_rate": 1.4704583716941147e-06, + "loss": 0.83422518, + "num_input_tokens_seen": 107165415, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.41210938, + "step": 4971, + "time_per_iteration": 3.04057240486145 + }, + { + "auxiliary_loss_clip": 0.01512264, + "auxiliary_loss_mlp": 0.01271338, + "balance_loss_clip": 1.14881039, + "balance_loss_mlp": 1.02872336, + "epoch": 0.5978476522575603, + "flos": 20378093382720.0, + "grad_norm": 1.755962518058892, + "language_loss": 0.72587883, + "learning_rate": 1.4697072406631672e-06, + "loss": 0.7537148, + "num_input_tokens_seen": 107185320, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.421875, + "step": 4972, + "time_per_iteration": 3.1363916397094727 + }, + { + "auxiliary_loss_clip": 0.01508212, + "auxiliary_loss_mlp": 0.01276896, + "balance_loss_clip": 1.14684391, + "balance_loss_mlp": 1.03027618, + "epoch": 0.5979678951481994, + "flos": 29025724350240.0, + "grad_norm": 2.0239380719872493, + "language_loss": 0.72646558, + "learning_rate": 1.4689561900772097e-06, + "loss": 0.75431669, + "num_input_tokens_seen": 107205380, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 2.46289062, + "step": 4973, + "time_per_iteration": 3.0958399772644043 + }, + { + "auxiliary_loss_clip": 0.01501743, + "auxiliary_loss_mlp": 0.01267697, + "balance_loss_clip": 1.13870525, + "balance_loss_mlp": 1.02489209, + "epoch": 0.5980881380388384, + "flos": 17969631177600.0, + "grad_norm": 2.9206643421494873, + "language_loss": 0.72804809, + "learning_rate": 1.4682052200501758e-06, + "loss": 0.75574255, + "num_input_tokens_seen": 107222585, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.42382812, + "step": 4974, + "time_per_iteration": 3.0589213371276855 + }, + { + "auxiliary_loss_clip": 0.01501378, + "auxiliary_loss_mlp": 0.01274758, + "balance_loss_clip": 1.13947845, + "balance_loss_mlp": 1.03080833, + "epoch": 0.5982083809294776, + "flos": 22964857689600.0, + "grad_norm": 1.9362793259884499, + "language_loss": 0.80349159, + "learning_rate": 1.4674543306959876e-06, + "loss": 0.83125293, + "num_input_tokens_seen": 107242055, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.43554688, + "step": 4975, + "time_per_iteration": 3.0799264907836914 + }, + { + "auxiliary_loss_clip": 0.01507791, + "auxiliary_loss_mlp": 0.01274828, + "balance_loss_clip": 1.14574933, + "balance_loss_mlp": 1.02649164, + "epoch": 0.5983286238201166, + "flos": 20993825325600.0, + "grad_norm": 3.05406133221904, + "language_loss": 0.84801608, + "learning_rate": 1.4667035221285535e-06, + "loss": 0.87584227, + "num_input_tokens_seen": 107259695, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.48046875, + "step": 4976, + "time_per_iteration": 3.0801899433135986 + }, + { + "auxiliary_loss_clip": 0.01506998, + "auxiliary_loss_mlp": 0.01272161, + "balance_loss_clip": 1.14582968, + "balance_loss_mlp": 1.02592242, + "epoch": 0.5984488667107557, + "flos": 28185948580320.0, + "grad_norm": 2.2537317231562657, + "language_loss": 0.74350768, + "learning_rate": 1.4659527944617715e-06, + "loss": 0.77129936, + "num_input_tokens_seen": 107279640, + "router_z_loss_clip": 3.609375, + "router_z_loss_mlp": 2.45898438, + "step": 4977, + "time_per_iteration": 3.065629243850708 + }, + { + "auxiliary_loss_clip": 0.01501444, + "auxiliary_loss_mlp": 0.01263171, + "balance_loss_clip": 1.13882351, + "balance_loss_mlp": 1.02017522, + "epoch": 0.5985691096013949, + "flos": 16473653753760.0, + "grad_norm": 1.9764604446832035, + "language_loss": 0.76644343, + "learning_rate": 1.465202147809526e-06, + "loss": 0.79408956, + "num_input_tokens_seen": 107298135, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.42578125, + "step": 4978, + "time_per_iteration": 3.03069806098938 + }, + { + "auxiliary_loss_clip": 0.01510403, + "auxiliary_loss_mlp": 0.01261395, + "balance_loss_clip": 1.14936304, + "balance_loss_mlp": 1.01725435, + "epoch": 0.5986893524920339, + "flos": 26721186396480.0, + "grad_norm": 2.0334396860831077, + "language_loss": 0.76338565, + "learning_rate": 1.4644515822856888e-06, + "loss": 0.79110372, + "num_input_tokens_seen": 107316570, + "router_z_loss_clip": 3.60742188, + "router_z_loss_mlp": 2.4375, + "step": 4979, + "time_per_iteration": 3.054234027862549 + }, + { + "auxiliary_loss_clip": 0.01500179, + "auxiliary_loss_mlp": 0.0120562, + "balance_loss_clip": 1.14415431, + "balance_loss_mlp": 1.00649261, + "epoch": 0.598809595382673, + "flos": 61614682958880.0, + "grad_norm": 0.7731042928404197, + "language_loss": 0.56520277, + "learning_rate": 1.4637010980041215e-06, + "loss": 0.59226078, + "num_input_tokens_seen": 107378680, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 1.98828125, + "step": 4980, + "time_per_iteration": 3.5524375438690186 + }, + { + "auxiliary_loss_clip": 0.01503482, + "auxiliary_loss_mlp": 0.01279755, + "balance_loss_clip": 1.14240026, + "balance_loss_mlp": 1.03332579, + "epoch": 0.5989298382733121, + "flos": 11803323139200.0, + "grad_norm": 2.267069951714518, + "language_loss": 0.897174, + "learning_rate": 1.4629506950786707e-06, + "loss": 0.92500639, + "num_input_tokens_seen": 107394860, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 2.4609375, + "step": 4981, + "time_per_iteration": 3.9399986267089844 + }, + { + "auxiliary_loss_clip": 0.01489106, + "auxiliary_loss_mlp": 0.01204376, + "balance_loss_clip": 1.13353407, + "balance_loss_mlp": 1.00524902, + "epoch": 0.5990500811639512, + "flos": 60031583822880.0, + "grad_norm": 0.8172656890785439, + "language_loss": 0.56042612, + "learning_rate": 1.4622003736231733e-06, + "loss": 0.58736092, + "num_input_tokens_seen": 107453850, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 1.984375, + "step": 4982, + "time_per_iteration": 3.443755865097046 + }, + { + "auxiliary_loss_clip": 0.01499673, + "auxiliary_loss_mlp": 0.01267944, + "balance_loss_clip": 1.1377871, + "balance_loss_mlp": 1.02113259, + "epoch": 0.5991703240545903, + "flos": 18224662675680.0, + "grad_norm": 1.8881489076409759, + "language_loss": 0.80822551, + "learning_rate": 1.461450133751451e-06, + "loss": 0.83590168, + "num_input_tokens_seen": 107471920, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 2.46484375, + "step": 4983, + "time_per_iteration": 3.1314902305603027 + }, + { + "auxiliary_loss_clip": 0.01508773, + "auxiliary_loss_mlp": 0.01264999, + "balance_loss_clip": 1.14551282, + "balance_loss_mlp": 1.01990509, + "epoch": 0.5992905669452293, + "flos": 27712410766560.0, + "grad_norm": 1.8571982690287019, + "language_loss": 0.76060081, + "learning_rate": 1.4606999755773153e-06, + "loss": 0.78833854, + "num_input_tokens_seen": 107493125, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.44726562, + "step": 4984, + "time_per_iteration": 3.054649591445923 + }, + { + "auxiliary_loss_clip": 0.01505679, + "auxiliary_loss_mlp": 0.01276509, + "balance_loss_clip": 1.1433537, + "balance_loss_mlp": 1.03179634, + "epoch": 0.5994108098358685, + "flos": 20451105819360.0, + "grad_norm": 1.7668296985964989, + "language_loss": 0.82476735, + "learning_rate": 1.4599498992145643e-06, + "loss": 0.85258925, + "num_input_tokens_seen": 107513150, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.44335938, + "step": 4985, + "time_per_iteration": 3.822126626968384 + }, + { + "auxiliary_loss_clip": 0.01503754, + "auxiliary_loss_mlp": 0.01271824, + "balance_loss_clip": 1.14016366, + "balance_loss_mlp": 1.02749252, + "epoch": 0.5995310527265075, + "flos": 22271979140640.0, + "grad_norm": 1.936288556716796, + "language_loss": 0.70304132, + "learning_rate": 1.4591999047769846e-06, + "loss": 0.73079711, + "num_input_tokens_seen": 107532005, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 2.43945312, + "step": 4986, + "time_per_iteration": 2.9706332683563232 + }, + { + "auxiliary_loss_clip": 0.01503238, + "auxiliary_loss_mlp": 0.01273839, + "balance_loss_clip": 1.14204049, + "balance_loss_mlp": 1.02817273, + "epoch": 0.5996512956171466, + "flos": 18918982494720.0, + "grad_norm": 1.9026614395355037, + "language_loss": 0.75481999, + "learning_rate": 1.4584499923783486e-06, + "loss": 0.78259075, + "num_input_tokens_seen": 107550585, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 2.453125, + "step": 4987, + "time_per_iteration": 3.0063083171844482 + }, + { + "auxiliary_loss_clip": 0.0149844, + "auxiliary_loss_mlp": 0.01276227, + "balance_loss_clip": 1.13506889, + "balance_loss_mlp": 1.03323066, + "epoch": 0.5997715385077858, + "flos": 15372360770400.0, + "grad_norm": 2.979084013502049, + "language_loss": 0.76099658, + "learning_rate": 1.457700162132419e-06, + "loss": 0.7887432, + "num_input_tokens_seen": 107567575, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 2.42578125, + "step": 4988, + "time_per_iteration": 3.0150351524353027 + }, + { + "auxiliary_loss_clip": 0.01497043, + "auxiliary_loss_mlp": 0.01263934, + "balance_loss_clip": 1.13450861, + "balance_loss_mlp": 1.0188396, + "epoch": 0.5998917813984248, + "flos": 25267195810080.0, + "grad_norm": 2.4840133411991627, + "language_loss": 0.72316849, + "learning_rate": 1.4569504141529433e-06, + "loss": 0.75077832, + "num_input_tokens_seen": 107585410, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.44726562, + "step": 4989, + "time_per_iteration": 4.010189771652222 + }, + { + "auxiliary_loss_clip": 0.01501565, + "auxiliary_loss_mlp": 0.01283347, + "balance_loss_clip": 1.1393013, + "balance_loss_mlp": 1.03653622, + "epoch": 0.6000120242890639, + "flos": 22056696718560.0, + "grad_norm": 2.7408107559090404, + "language_loss": 0.71772194, + "learning_rate": 1.456200748553658e-06, + "loss": 0.74557102, + "num_input_tokens_seen": 107603405, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.46484375, + "step": 4990, + "time_per_iteration": 3.074505567550659 + }, + { + "auxiliary_loss_clip": 0.01502621, + "auxiliary_loss_mlp": 0.01270974, + "balance_loss_clip": 1.13857841, + "balance_loss_mlp": 1.02473485, + "epoch": 0.600132267179703, + "flos": 29866106970720.0, + "grad_norm": 1.543722284256963, + "language_loss": 0.7879324, + "learning_rate": 1.455451165448287e-06, + "loss": 0.81566834, + "num_input_tokens_seen": 107626060, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 2.45898438, + "step": 4991, + "time_per_iteration": 3.148634910583496 + }, + { + "auxiliary_loss_clip": 0.01499278, + "auxiliary_loss_mlp": 0.01277253, + "balance_loss_clip": 1.13485277, + "balance_loss_mlp": 1.03234899, + "epoch": 0.6002525100703421, + "flos": 25047665434080.0, + "grad_norm": 2.9367683421304727, + "language_loss": 0.7414301, + "learning_rate": 1.4547016649505407e-06, + "loss": 0.76919544, + "num_input_tokens_seen": 107644070, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 2.4453125, + "step": 4992, + "time_per_iteration": 3.1926937103271484 + }, + { + "auxiliary_loss_clip": 0.01501159, + "auxiliary_loss_mlp": 0.01270938, + "balance_loss_clip": 1.13851523, + "balance_loss_mlp": 1.0258435, + "epoch": 0.6003727529609811, + "flos": 20851669124640.0, + "grad_norm": 2.5959581790261037, + "language_loss": 0.84899318, + "learning_rate": 1.4539522471741193e-06, + "loss": 0.87671423, + "num_input_tokens_seen": 107661495, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.44726562, + "step": 4993, + "time_per_iteration": 3.1555702686309814 + }, + { + "auxiliary_loss_clip": 0.01498782, + "auxiliary_loss_mlp": 0.01275542, + "balance_loss_clip": 1.13448179, + "balance_loss_mlp": 1.02453542, + "epoch": 0.6004929958516203, + "flos": 15596328741120.0, + "grad_norm": 2.024516155241953, + "language_loss": 0.70717692, + "learning_rate": 1.4532029122327067e-06, + "loss": 0.73492014, + "num_input_tokens_seen": 107678280, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 2.5078125, + "step": 4994, + "time_per_iteration": 3.077251672744751 + }, + { + "auxiliary_loss_clip": 0.01498089, + "auxiliary_loss_mlp": 0.01273351, + "balance_loss_clip": 1.13320303, + "balance_loss_mlp": 1.02921069, + "epoch": 0.6006132387422594, + "flos": 21765481391520.0, + "grad_norm": 2.20516356009639, + "language_loss": 0.75585496, + "learning_rate": 1.4524536602399783e-06, + "loss": 0.78356934, + "num_input_tokens_seen": 107697370, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 2.4375, + "step": 4995, + "time_per_iteration": 3.1089510917663574 + }, + { + "auxiliary_loss_clip": 0.01503554, + "auxiliary_loss_mlp": 0.01273617, + "balance_loss_clip": 1.1384728, + "balance_loss_mlp": 1.03004837, + "epoch": 0.6007334816328984, + "flos": 22860857581920.0, + "grad_norm": 1.7469802499474758, + "language_loss": 0.77560848, + "learning_rate": 1.4517044913095938e-06, + "loss": 0.80338019, + "num_input_tokens_seen": 107717790, + "router_z_loss_clip": 3.65039062, + "router_z_loss_mlp": 2.43164062, + "step": 4996, + "time_per_iteration": 3.0230133533477783 + }, + { + "auxiliary_loss_clip": 0.01502573, + "auxiliary_loss_mlp": 0.01274015, + "balance_loss_clip": 1.1394937, + "balance_loss_mlp": 1.02625012, + "epoch": 0.6008537245235376, + "flos": 28326815223840.0, + "grad_norm": 2.163760068072316, + "language_loss": 0.81713575, + "learning_rate": 1.4509554055552022e-06, + "loss": 0.84490168, + "num_input_tokens_seen": 107738020, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.47460938, + "step": 4997, + "time_per_iteration": 3.0581140518188477 + }, + { + "auxiliary_loss_clip": 0.01503254, + "auxiliary_loss_mlp": 0.01278604, + "balance_loss_clip": 1.14029241, + "balance_loss_mlp": 1.03408217, + "epoch": 0.6009739674141766, + "flos": 20888194307040.0, + "grad_norm": 2.4976474269087747, + "language_loss": 0.8461051, + "learning_rate": 1.450206403090439e-06, + "loss": 0.87392366, + "num_input_tokens_seen": 107756215, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.44140625, + "step": 4998, + "time_per_iteration": 3.1306521892547607 + }, + { + "auxiliary_loss_clip": 0.01504188, + "auxiliary_loss_mlp": 0.0127387, + "balance_loss_clip": 1.13938785, + "balance_loss_mlp": 1.03163719, + "epoch": 0.6010942103048157, + "flos": 20482852053600.0, + "grad_norm": 2.1232202703633223, + "language_loss": 0.86526042, + "learning_rate": 1.4494574840289274e-06, + "loss": 0.89304101, + "num_input_tokens_seen": 107773330, + "router_z_loss_clip": 3.65039062, + "router_z_loss_mlp": 2.41796875, + "step": 4999, + "time_per_iteration": 3.02163028717041 + }, + { + "auxiliary_loss_clip": 0.01499697, + "auxiliary_loss_mlp": 0.01271545, + "balance_loss_clip": 1.13602924, + "balance_loss_mlp": 1.02416158, + "epoch": 0.6012144531954549, + "flos": 23808274562880.0, + "grad_norm": 1.9163795145779963, + "language_loss": 0.73957664, + "learning_rate": 1.4487086484842782e-06, + "loss": 0.76728904, + "num_input_tokens_seen": 107791975, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 2.47070312, + "step": 5000, + "time_per_iteration": 3.0623347759246826 + }, + { + "auxiliary_loss_clip": 0.01500641, + "auxiliary_loss_mlp": 0.01265704, + "balance_loss_clip": 1.1376338, + "balance_loss_mlp": 1.02156341, + "epoch": 0.6013346960860939, + "flos": 18990326092320.0, + "grad_norm": 2.1907499242736654, + "language_loss": 0.60432315, + "learning_rate": 1.4479598965700878e-06, + "loss": 0.63198656, + "num_input_tokens_seen": 107809240, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.4375, + "step": 5001, + "time_per_iteration": 3.040414571762085 + }, + { + "auxiliary_loss_clip": 0.01504159, + "auxiliary_loss_mlp": 0.0127121, + "balance_loss_clip": 1.13999188, + "balance_loss_mlp": 1.02687883, + "epoch": 0.601454938976733, + "flos": 24027653226240.0, + "grad_norm": 3.4433950146667347, + "language_loss": 0.69261515, + "learning_rate": 1.4472112283999427e-06, + "loss": 0.72036886, + "num_input_tokens_seen": 107827895, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.43945312, + "step": 5002, + "time_per_iteration": 3.053696870803833 + }, + { + "auxiliary_loss_clip": 0.01496133, + "auxiliary_loss_mlp": 0.01256579, + "balance_loss_clip": 1.13272822, + "balance_loss_mlp": 1.01453674, + "epoch": 0.6015751818673721, + "flos": 26429250434400.0, + "grad_norm": 4.9647356999144865, + "language_loss": 0.69274783, + "learning_rate": 1.4464626440874143e-06, + "loss": 0.72027493, + "num_input_tokens_seen": 107847010, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 2.41601562, + "step": 5003, + "time_per_iteration": 3.0743918418884277 + }, + { + "auxiliary_loss_clip": 0.01501018, + "auxiliary_loss_mlp": 0.01271158, + "balance_loss_clip": 1.13673246, + "balance_loss_mlp": 1.02606392, + "epoch": 0.6016954247580112, + "flos": 13116940148160.0, + "grad_norm": 2.501020175451438, + "language_loss": 0.74585116, + "learning_rate": 1.4457141437460636e-06, + "loss": 0.77357292, + "num_input_tokens_seen": 107864235, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 2.44726562, + "step": 5004, + "time_per_iteration": 3.055663824081421 + }, + { + "auxiliary_loss_clip": 0.01500068, + "auxiliary_loss_mlp": 0.01271588, + "balance_loss_clip": 1.13571286, + "balance_loss_mlp": 1.02496755, + "epoch": 0.6018156676486502, + "flos": 23770687392000.0, + "grad_norm": 2.0801996529595352, + "language_loss": 0.73390949, + "learning_rate": 1.444965727489436e-06, + "loss": 0.76162601, + "num_input_tokens_seen": 107883680, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 2.46289062, + "step": 5005, + "time_per_iteration": 3.0655276775360107 + }, + { + "auxiliary_loss_clip": 0.01499268, + "auxiliary_loss_mlp": 0.01276953, + "balance_loss_clip": 1.13772154, + "balance_loss_mlp": 1.03071463, + "epoch": 0.6019359105392894, + "flos": 26471806194240.0, + "grad_norm": 1.7798534766430478, + "language_loss": 0.63247269, + "learning_rate": 1.444217395431066e-06, + "loss": 0.66023493, + "num_input_tokens_seen": 107906220, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 2.45898438, + "step": 5006, + "time_per_iteration": 3.1915040016174316 + }, + { + "auxiliary_loss_clip": 0.01495682, + "auxiliary_loss_mlp": 0.011987, + "balance_loss_clip": 1.13885927, + "balance_loss_mlp": 1.00186157, + "epoch": 0.6020561534299285, + "flos": 69197546125440.0, + "grad_norm": 0.8001046066702028, + "language_loss": 0.55820602, + "learning_rate": 1.4434691476844755e-06, + "loss": 0.58514988, + "num_input_tokens_seen": 107967195, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 1.96875, + "step": 5007, + "time_per_iteration": 3.5108695030212402 + }, + { + "auxiliary_loss_clip": 0.01502819, + "auxiliary_loss_mlp": 0.01269938, + "balance_loss_clip": 1.14009118, + "balance_loss_mlp": 1.02827728, + "epoch": 0.6021763963205675, + "flos": 21837469767840.0, + "grad_norm": 2.118465310637746, + "language_loss": 0.67094123, + "learning_rate": 1.4427209843631729e-06, + "loss": 0.69866878, + "num_input_tokens_seen": 107984245, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.41210938, + "step": 5008, + "time_per_iteration": 4.1539833545684814 + }, + { + "auxiliary_loss_clip": 0.0151037, + "auxiliary_loss_mlp": 0.01265788, + "balance_loss_clip": 1.14690173, + "balance_loss_mlp": 1.02298236, + "epoch": 0.6022966392112067, + "flos": 26581078316160.0, + "grad_norm": 2.3114660466055965, + "language_loss": 0.81256068, + "learning_rate": 1.4419729055806534e-06, + "loss": 0.84032226, + "num_input_tokens_seen": 108003680, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.42382812, + "step": 5009, + "time_per_iteration": 3.1197991371154785 + }, + { + "auxiliary_loss_clip": 0.01506362, + "auxiliary_loss_mlp": 0.01264509, + "balance_loss_clip": 1.14398694, + "balance_loss_mlp": 1.02094078, + "epoch": 0.6024168821018457, + "flos": 20705795964000.0, + "grad_norm": 3.2865411735994314, + "language_loss": 0.82180369, + "learning_rate": 1.441224911450401e-06, + "loss": 0.8495124, + "num_input_tokens_seen": 108019635, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.43164062, + "step": 5010, + "time_per_iteration": 3.0057694911956787 + }, + { + "auxiliary_loss_clip": 0.01511844, + "auxiliary_loss_mlp": 0.01276176, + "balance_loss_clip": 1.14938402, + "balance_loss_mlp": 1.02917445, + "epoch": 0.6025371249924848, + "flos": 24683285957760.0, + "grad_norm": 1.8589514567165086, + "language_loss": 0.8226527, + "learning_rate": 1.4404770020858851e-06, + "loss": 0.85053295, + "num_input_tokens_seen": 108039120, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.46679688, + "step": 5011, + "time_per_iteration": 3.015425443649292 + }, + { + "auxiliary_loss_clip": 0.01504178, + "auxiliary_loss_mlp": 0.01275223, + "balance_loss_clip": 1.1408931, + "balance_loss_mlp": 1.02917528, + "epoch": 0.602657367883124, + "flos": 25958064166560.0, + "grad_norm": 1.656206255942205, + "language_loss": 0.86088061, + "learning_rate": 1.439729177600563e-06, + "loss": 0.88867462, + "num_input_tokens_seen": 108059615, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.45703125, + "step": 5012, + "time_per_iteration": 3.8935678005218506 + }, + { + "auxiliary_loss_clip": 0.0150245, + "auxiliary_loss_mlp": 0.01277565, + "balance_loss_clip": 1.1412437, + "balance_loss_mlp": 1.0326618, + "epoch": 0.602777610773763, + "flos": 16692501422880.0, + "grad_norm": 2.1560547315335166, + "language_loss": 0.73345757, + "learning_rate": 1.4389814381078793e-06, + "loss": 0.76125777, + "num_input_tokens_seen": 108078855, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 2.4453125, + "step": 5013, + "time_per_iteration": 3.0170435905456543 + }, + { + "auxiliary_loss_clip": 0.01504407, + "auxiliary_loss_mlp": 0.01275839, + "balance_loss_clip": 1.1419971, + "balance_loss_mlp": 1.03227067, + "epoch": 0.6028978536644021, + "flos": 13335939529920.0, + "grad_norm": 2.070198760477864, + "language_loss": 0.80193126, + "learning_rate": 1.438233783721265e-06, + "loss": 0.82973367, + "num_input_tokens_seen": 108095020, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.43164062, + "step": 5014, + "time_per_iteration": 3.055161952972412 + }, + { + "auxiliary_loss_clip": 0.01503886, + "auxiliary_loss_mlp": 0.0127124, + "balance_loss_clip": 1.14050412, + "balance_loss_mlp": 1.02805328, + "epoch": 0.6030180965550412, + "flos": 19646793243360.0, + "grad_norm": 6.790446953422456, + "language_loss": 0.78025341, + "learning_rate": 1.43748621455414e-06, + "loss": 0.80800468, + "num_input_tokens_seen": 108111455, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.42773438, + "step": 5015, + "time_per_iteration": 3.1153533458709717 + }, + { + "auxiliary_loss_clip": 0.01500737, + "auxiliary_loss_mlp": 0.01274355, + "balance_loss_clip": 1.13681698, + "balance_loss_mlp": 1.02945161, + "epoch": 0.6031383394456803, + "flos": 14459762204640.0, + "grad_norm": 3.8837171223380103, + "language_loss": 0.80810225, + "learning_rate": 1.4367387307199082e-06, + "loss": 0.83585322, + "num_input_tokens_seen": 108128305, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.4453125, + "step": 5016, + "time_per_iteration": 3.069460153579712 + }, + { + "auxiliary_loss_clip": 0.01506136, + "auxiliary_loss_mlp": 0.0127813, + "balance_loss_clip": 1.14265871, + "balance_loss_mlp": 1.03112841, + "epoch": 0.6032585823363193, + "flos": 13919508028800.0, + "grad_norm": 4.796054706747649, + "language_loss": 0.82856429, + "learning_rate": 1.4359913323319632e-06, + "loss": 0.85640699, + "num_input_tokens_seen": 108145475, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.46679688, + "step": 5017, + "time_per_iteration": 4.088239669799805 + }, + { + "auxiliary_loss_clip": 0.01505413, + "auxiliary_loss_mlp": 0.01268622, + "balance_loss_clip": 1.14302754, + "balance_loss_mlp": 1.02486312, + "epoch": 0.6033788252269584, + "flos": 24355583376480.0, + "grad_norm": 1.8559628151750212, + "language_loss": 0.77967924, + "learning_rate": 1.4352440195036847e-06, + "loss": 0.8074196, + "num_input_tokens_seen": 108165650, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.43359375, + "step": 5018, + "time_per_iteration": 3.0979528427124023 + }, + { + "auxiliary_loss_clip": 0.01499178, + "auxiliary_loss_mlp": 0.01275715, + "balance_loss_clip": 1.13554931, + "balance_loss_mlp": 1.02909517, + "epoch": 0.6034990681175976, + "flos": 25523668578240.0, + "grad_norm": 1.5921335487914081, + "language_loss": 0.80060601, + "learning_rate": 1.4344967923484395e-06, + "loss": 0.82835495, + "num_input_tokens_seen": 108187620, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.46289062, + "step": 5019, + "time_per_iteration": 3.0033929347991943 + }, + { + "auxiliary_loss_clip": 0.01497712, + "auxiliary_loss_mlp": 0.01271132, + "balance_loss_clip": 1.13369846, + "balance_loss_mlp": 1.02756393, + "epoch": 0.6036193110082366, + "flos": 25960794994080.0, + "grad_norm": 2.3934690562337333, + "language_loss": 0.72336197, + "learning_rate": 1.433749650979581e-06, + "loss": 0.75105041, + "num_input_tokens_seen": 108207605, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 2.43164062, + "step": 5020, + "time_per_iteration": 3.0186400413513184 + }, + { + "auxiliary_loss_clip": 0.01500431, + "auxiliary_loss_mlp": 0.01277398, + "balance_loss_clip": 1.136482, + "balance_loss_mlp": 1.03268552, + "epoch": 0.6037395538988757, + "flos": 25595808667200.0, + "grad_norm": 2.715715356291097, + "language_loss": 0.67849386, + "learning_rate": 1.433002595510451e-06, + "loss": 0.70627213, + "num_input_tokens_seen": 108226385, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.44335938, + "step": 5021, + "time_per_iteration": 3.005037784576416 + }, + { + "auxiliary_loss_clip": 0.01499437, + "auxiliary_loss_mlp": 0.01281063, + "balance_loss_clip": 1.13626349, + "balance_loss_mlp": 1.0373044, + "epoch": 0.6038597967895148, + "flos": 17817879152160.0, + "grad_norm": 1.9000920669341856, + "language_loss": 0.71817493, + "learning_rate": 1.4322556260543757e-06, + "loss": 0.74597991, + "num_input_tokens_seen": 108242960, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.43359375, + "step": 5022, + "time_per_iteration": 2.9408082962036133 + }, + { + "auxiliary_loss_clip": 0.01490546, + "auxiliary_loss_mlp": 0.01200317, + "balance_loss_clip": 1.13378, + "balance_loss_mlp": 1.00614929, + "epoch": 0.6039800396801539, + "flos": 65175224682240.0, + "grad_norm": 0.8943683669012948, + "language_loss": 0.62671602, + "learning_rate": 1.4315087427246703e-06, + "loss": 0.65362465, + "num_input_tokens_seen": 108296785, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 1.94140625, + "step": 5023, + "time_per_iteration": 3.37268328666687 + }, + { + "auxiliary_loss_clip": 0.01489151, + "auxiliary_loss_mlp": 0.01202125, + "balance_loss_clip": 1.13261795, + "balance_loss_mlp": 1.00719452, + "epoch": 0.604100282570793, + "flos": 67392072001440.0, + "grad_norm": 0.8734899034002619, + "language_loss": 0.58423328, + "learning_rate": 1.4307619456346372e-06, + "loss": 0.61114609, + "num_input_tokens_seen": 108341090, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 1.94921875, + "step": 5024, + "time_per_iteration": 3.10994553565979 + }, + { + "auxiliary_loss_clip": 0.01498251, + "auxiliary_loss_mlp": 0.01276559, + "balance_loss_clip": 1.13419318, + "balance_loss_mlp": 1.03184605, + "epoch": 0.6042205254614321, + "flos": 35300356306560.0, + "grad_norm": 2.2131392788567643, + "language_loss": 0.74354619, + "learning_rate": 1.430015234897564e-06, + "loss": 0.77129436, + "num_input_tokens_seen": 108364370, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 2.44335938, + "step": 5025, + "time_per_iteration": 3.12097430229187 + }, + { + "auxiliary_loss_clip": 0.01498341, + "auxiliary_loss_mlp": 0.01273832, + "balance_loss_clip": 1.13359737, + "balance_loss_mlp": 1.03026354, + "epoch": 0.6043407683520712, + "flos": 45661636594080.0, + "grad_norm": 1.9524240125005592, + "language_loss": 0.66829002, + "learning_rate": 1.4292686106267274e-06, + "loss": 0.69601178, + "num_input_tokens_seen": 108387220, + "router_z_loss_clip": 3.65039062, + "router_z_loss_mlp": 2.43164062, + "step": 5026, + "time_per_iteration": 3.1846840381622314 + }, + { + "auxiliary_loss_clip": 0.01502671, + "auxiliary_loss_mlp": 0.01269939, + "balance_loss_clip": 1.1386559, + "balance_loss_mlp": 1.02637076, + "epoch": 0.6044610112427102, + "flos": 16181869504320.0, + "grad_norm": 1.8507892734244376, + "language_loss": 0.77068943, + "learning_rate": 1.4285220729353876e-06, + "loss": 0.79841554, + "num_input_tokens_seen": 108405760, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 2.43164062, + "step": 5027, + "time_per_iteration": 3.0188493728637695 + }, + { + "auxiliary_loss_clip": 0.01496514, + "auxiliary_loss_mlp": 0.01264627, + "balance_loss_clip": 1.13243473, + "balance_loss_mlp": 1.0202961, + "epoch": 0.6045812541333494, + "flos": 13805532815040.0, + "grad_norm": 2.638910786148559, + "language_loss": 0.77926213, + "learning_rate": 1.4277756219367957e-06, + "loss": 0.8068735, + "num_input_tokens_seen": 108422785, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.43945312, + "step": 5028, + "time_per_iteration": 3.079646348953247 + }, + { + "auxiliary_loss_clip": 0.01503728, + "auxiliary_loss_mlp": 0.01262747, + "balance_loss_clip": 1.14006567, + "balance_loss_mlp": 1.01727128, + "epoch": 0.6047014970239885, + "flos": 19977150795840.0, + "grad_norm": 2.0904649236431823, + "language_loss": 0.79808211, + "learning_rate": 1.4270292577441864e-06, + "loss": 0.82574689, + "num_input_tokens_seen": 108442290, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 2.45117188, + "step": 5029, + "time_per_iteration": 3.1002848148345947 + }, + { + "auxiliary_loss_clip": 0.01503596, + "auxiliary_loss_mlp": 0.01277706, + "balance_loss_clip": 1.13935649, + "balance_loss_mlp": 1.02879763, + "epoch": 0.6048217399146275, + "flos": 25339411755360.0, + "grad_norm": 2.09828827341435, + "language_loss": 0.72066927, + "learning_rate": 1.4262829804707836e-06, + "loss": 0.74848235, + "num_input_tokens_seen": 108464280, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 2.48632812, + "step": 5030, + "time_per_iteration": 3.1495635509490967 + }, + { + "auxiliary_loss_clip": 0.01501242, + "auxiliary_loss_mlp": 0.01277736, + "balance_loss_clip": 1.13628316, + "balance_loss_mlp": 1.03130674, + "epoch": 0.6049419828052667, + "flos": 26033200580160.0, + "grad_norm": 1.4389201145512611, + "language_loss": 0.70076883, + "learning_rate": 1.4255367902297958e-06, + "loss": 0.72855866, + "num_input_tokens_seen": 108485610, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 2.4609375, + "step": 5031, + "time_per_iteration": 3.194000005722046 + }, + { + "auxiliary_loss_clip": 0.01505062, + "auxiliary_loss_mlp": 0.01272394, + "balance_loss_clip": 1.14192009, + "balance_loss_mlp": 1.02844429, + "epoch": 0.6050622256959057, + "flos": 14649897892320.0, + "grad_norm": 2.5756925623053153, + "language_loss": 0.78414339, + "learning_rate": 1.4247906871344215e-06, + "loss": 0.81191796, + "num_input_tokens_seen": 108501005, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.43554688, + "step": 5032, + "time_per_iteration": 3.008101224899292 + }, + { + "auxiliary_loss_clip": 0.01499494, + "auxiliary_loss_mlp": 0.01263169, + "balance_loss_clip": 1.13629031, + "balance_loss_mlp": 1.02093577, + "epoch": 0.6051824685865448, + "flos": 23333712688800.0, + "grad_norm": 2.268252870209542, + "language_loss": 0.74959087, + "learning_rate": 1.4240446712978415e-06, + "loss": 0.77721751, + "num_input_tokens_seen": 108519990, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.41796875, + "step": 5033, + "time_per_iteration": 3.075848340988159 + }, + { + "auxiliary_loss_clip": 0.01505761, + "auxiliary_loss_mlp": 0.01265681, + "balance_loss_clip": 1.14001441, + "balance_loss_mlp": 1.02249455, + "epoch": 0.605302711477184, + "flos": 27565779042720.0, + "grad_norm": 2.080262426046942, + "language_loss": 0.74178231, + "learning_rate": 1.423298742833227e-06, + "loss": 0.7694968, + "num_input_tokens_seen": 108538650, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 2.42773438, + "step": 5034, + "time_per_iteration": 3.0822665691375732 + }, + { + "auxiliary_loss_clip": 0.01499496, + "auxiliary_loss_mlp": 0.01266393, + "balance_loss_clip": 1.13558412, + "balance_loss_mlp": 1.02148938, + "epoch": 0.605422954367823, + "flos": 15156281856960.0, + "grad_norm": 2.765849804984618, + "language_loss": 0.71807992, + "learning_rate": 1.4225529018537352e-06, + "loss": 0.74573874, + "num_input_tokens_seen": 108554155, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 2.4453125, + "step": 5035, + "time_per_iteration": 4.031914472579956 + }, + { + "auxiliary_loss_clip": 0.01502486, + "auxiliary_loss_mlp": 0.01268172, + "balance_loss_clip": 1.13751125, + "balance_loss_mlp": 1.02403188, + "epoch": 0.6055431972584621, + "flos": 27676454506560.0, + "grad_norm": 2.237124288020339, + "language_loss": 0.7784164, + "learning_rate": 1.4218071484725082e-06, + "loss": 0.80612302, + "num_input_tokens_seen": 108576275, + "router_z_loss_clip": 3.65039062, + "router_z_loss_mlp": 2.4375, + "step": 5036, + "time_per_iteration": 3.831449031829834 + }, + { + "auxiliary_loss_clip": 0.01504317, + "auxiliary_loss_mlp": 0.01270275, + "balance_loss_clip": 1.1405313, + "balance_loss_mlp": 1.02556229, + "epoch": 0.6056634401491012, + "flos": 19389334343040.0, + "grad_norm": 2.324283709488205, + "language_loss": 0.76216805, + "learning_rate": 1.4210614828026786e-06, + "loss": 0.78991401, + "num_input_tokens_seen": 108594125, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.44335938, + "step": 5037, + "time_per_iteration": 3.1728169918060303 + }, + { + "auxiliary_loss_clip": 0.01501592, + "auxiliary_loss_mlp": 0.0125896, + "balance_loss_clip": 1.13784456, + "balance_loss_mlp": 1.01596379, + "epoch": 0.6057836830397403, + "flos": 24791723660160.0, + "grad_norm": 1.5588933630145028, + "language_loss": 0.74584401, + "learning_rate": 1.4203159049573605e-06, + "loss": 0.77344954, + "num_input_tokens_seen": 108615360, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 2.42578125, + "step": 5038, + "time_per_iteration": 3.027906656265259 + }, + { + "auxiliary_loss_clip": 0.01503839, + "auxiliary_loss_mlp": 0.01273703, + "balance_loss_clip": 1.13935161, + "balance_loss_mlp": 1.0308975, + "epoch": 0.6059039259303793, + "flos": 20560340013120.0, + "grad_norm": 2.0779958688898525, + "language_loss": 0.87067467, + "learning_rate": 1.4195704150496593e-06, + "loss": 0.89845008, + "num_input_tokens_seen": 108633075, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 2.42382812, + "step": 5039, + "time_per_iteration": 3.831545352935791 + }, + { + "auxiliary_loss_clip": 0.01507369, + "auxiliary_loss_mlp": 0.01262933, + "balance_loss_clip": 1.14291406, + "balance_loss_mlp": 1.0212723, + "epoch": 0.6060241688210185, + "flos": 21071882207520.0, + "grad_norm": 1.8457841232586347, + "language_loss": 0.7408812, + "learning_rate": 1.4188250131926639e-06, + "loss": 0.76858419, + "num_input_tokens_seen": 108651875, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.41210938, + "step": 5040, + "time_per_iteration": 3.0747764110565186 + }, + { + "auxiliary_loss_clip": 0.01507527, + "auxiliary_loss_mlp": 0.012674, + "balance_loss_clip": 1.14281011, + "balance_loss_mlp": 1.02306819, + "epoch": 0.6061444117116576, + "flos": 16362864505440.0, + "grad_norm": 2.508324843650758, + "language_loss": 0.80714375, + "learning_rate": 1.4180796994994525e-06, + "loss": 0.83489305, + "num_input_tokens_seen": 108669290, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 2.43945312, + "step": 5041, + "time_per_iteration": 3.0012102127075195 + }, + { + "auxiliary_loss_clip": 0.01504538, + "auxiliary_loss_mlp": 0.01261806, + "balance_loss_clip": 1.14087045, + "balance_loss_mlp": 1.01976347, + "epoch": 0.6062646546022966, + "flos": 21509463761280.0, + "grad_norm": 2.013306931753902, + "language_loss": 0.72100943, + "learning_rate": 1.4173344740830877e-06, + "loss": 0.74867284, + "num_input_tokens_seen": 108688420, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 2.41601562, + "step": 5042, + "time_per_iteration": 3.0045907497406006 + }, + { + "auxiliary_loss_clip": 0.01512563, + "auxiliary_loss_mlp": 0.01267834, + "balance_loss_clip": 1.14849257, + "balance_loss_mlp": 1.02464724, + "epoch": 0.6063848974929358, + "flos": 38986251691680.0, + "grad_norm": 1.5641026058558498, + "language_loss": 0.70729733, + "learning_rate": 1.4165893370566206e-06, + "loss": 0.73510134, + "num_input_tokens_seen": 108712175, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 2.42773438, + "step": 5043, + "time_per_iteration": 3.1203982830047607 + }, + { + "auxiliary_loss_clip": 0.01503631, + "auxiliary_loss_mlp": 0.01263319, + "balance_loss_clip": 1.13815117, + "balance_loss_mlp": 1.0197506, + "epoch": 0.6065051403835748, + "flos": 19648651723200.0, + "grad_norm": 2.4713573580745485, + "language_loss": 0.77776593, + "learning_rate": 1.4158442885330865e-06, + "loss": 0.80543542, + "num_input_tokens_seen": 108730745, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 2.43164062, + "step": 5044, + "time_per_iteration": 2.9471993446350098 + }, + { + "auxiliary_loss_clip": 0.01512162, + "auxiliary_loss_mlp": 0.01266484, + "balance_loss_clip": 1.14798951, + "balance_loss_mlp": 1.02291536, + "epoch": 0.6066253832742139, + "flos": 23515314540480.0, + "grad_norm": 1.975291015935978, + "language_loss": 0.78811514, + "learning_rate": 1.4150993286255094e-06, + "loss": 0.81590152, + "num_input_tokens_seen": 108749995, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 2.43164062, + "step": 5045, + "time_per_iteration": 3.7740447521209717 + }, + { + "auxiliary_loss_clip": 0.01502517, + "auxiliary_loss_mlp": 0.01268697, + "balance_loss_clip": 1.13870978, + "balance_loss_mlp": 1.02665401, + "epoch": 0.6067456261648531, + "flos": 19135630330560.0, + "grad_norm": 3.818814130939172, + "language_loss": 0.79946482, + "learning_rate": 1.4143544574468993e-06, + "loss": 0.82717693, + "num_input_tokens_seen": 108768355, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.41601562, + "step": 5046, + "time_per_iteration": 2.9920387268066406 + }, + { + "auxiliary_loss_clip": 0.01508253, + "auxiliary_loss_mlp": 0.01264474, + "balance_loss_clip": 1.14484262, + "balance_loss_mlp": 1.01957059, + "epoch": 0.6068658690554921, + "flos": 20522639057760.0, + "grad_norm": 1.7416224864264447, + "language_loss": 0.82284772, + "learning_rate": 1.4136096751102523e-06, + "loss": 0.85057509, + "num_input_tokens_seen": 108786685, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 2.4453125, + "step": 5047, + "time_per_iteration": 2.922375202178955 + }, + { + "auxiliary_loss_clip": 0.01506284, + "auxiliary_loss_mlp": 0.01273889, + "balance_loss_clip": 1.14169073, + "balance_loss_mlp": 1.03070188, + "epoch": 0.6069861119461312, + "flos": 27376667415360.0, + "grad_norm": 2.7220558134866417, + "language_loss": 0.83317399, + "learning_rate": 1.4128649817285516e-06, + "loss": 0.86097574, + "num_input_tokens_seen": 108804820, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 2.42773438, + "step": 5048, + "time_per_iteration": 3.0090596675872803 + }, + { + "auxiliary_loss_clip": 0.01510146, + "auxiliary_loss_mlp": 0.01265107, + "balance_loss_clip": 1.14551091, + "balance_loss_mlp": 1.02459073, + "epoch": 0.6071063548367702, + "flos": 25628692746240.0, + "grad_norm": 7.058853311952929, + "language_loss": 0.63880664, + "learning_rate": 1.412120377414766e-06, + "loss": 0.66655916, + "num_input_tokens_seen": 108825010, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 2.40039062, + "step": 5049, + "time_per_iteration": 3.0426669120788574 + }, + { + "auxiliary_loss_clip": 0.01509376, + "auxiliary_loss_mlp": 0.01262992, + "balance_loss_clip": 1.1452527, + "balance_loss_mlp": 1.0213306, + "epoch": 0.6072265977274094, + "flos": 24462731521440.0, + "grad_norm": 2.1699476630727186, + "language_loss": 0.71249986, + "learning_rate": 1.4113758622818522e-06, + "loss": 0.74022353, + "num_input_tokens_seen": 108845075, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 2.41210938, + "step": 5050, + "time_per_iteration": 3.0673155784606934 + }, + { + "auxiliary_loss_clip": 0.01511171, + "auxiliary_loss_mlp": 0.01268303, + "balance_loss_clip": 1.14858186, + "balance_loss_mlp": 1.02873993, + "epoch": 0.6073468406180484, + "flos": 18151157172960.0, + "grad_norm": 2.0280411758903454, + "language_loss": 0.83485788, + "learning_rate": 1.410631436442751e-06, + "loss": 0.8626526, + "num_input_tokens_seen": 108863870, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.390625, + "step": 5051, + "time_per_iteration": 3.0094358921051025 + }, + { + "auxiliary_loss_clip": 0.01508355, + "auxiliary_loss_mlp": 0.01267613, + "balance_loss_clip": 1.14699233, + "balance_loss_mlp": 1.02518892, + "epoch": 0.6074670835086875, + "flos": 20699348176800.0, + "grad_norm": 2.0937034615418226, + "language_loss": 0.86759579, + "learning_rate": 1.4098871000103936e-06, + "loss": 0.89535546, + "num_input_tokens_seen": 108882470, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 2.41992188, + "step": 5052, + "time_per_iteration": 3.0445287227630615 + }, + { + "auxiliary_loss_clip": 0.01507208, + "auxiliary_loss_mlp": 0.01268876, + "balance_loss_clip": 1.14484322, + "balance_loss_mlp": 1.0264523, + "epoch": 0.6075873263993267, + "flos": 23772280374720.0, + "grad_norm": 1.8020098552506643, + "language_loss": 0.82732987, + "learning_rate": 1.409142853097693e-06, + "loss": 0.85509074, + "num_input_tokens_seen": 108902710, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 2.41992188, + "step": 5053, + "time_per_iteration": 3.091034412384033 + }, + { + "auxiliary_loss_clip": 0.01504733, + "auxiliary_loss_mlp": 0.01268523, + "balance_loss_clip": 1.14084232, + "balance_loss_mlp": 1.02457356, + "epoch": 0.6077075692899657, + "flos": 24456359590560.0, + "grad_norm": 2.71687264840294, + "language_loss": 0.79774064, + "learning_rate": 1.408398695817553e-06, + "loss": 0.82547319, + "num_input_tokens_seen": 108919935, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 2.43554688, + "step": 5054, + "time_per_iteration": 3.021313428878784 + }, + { + "auxiliary_loss_clip": 0.01507577, + "auxiliary_loss_mlp": 0.01267362, + "balance_loss_clip": 1.14463496, + "balance_loss_mlp": 1.02188611, + "epoch": 0.6078278121806048, + "flos": 27384632328960.0, + "grad_norm": 2.017701377641406, + "language_loss": 0.70282674, + "learning_rate": 1.4076546282828593e-06, + "loss": 0.73057616, + "num_input_tokens_seen": 108942790, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.45117188, + "step": 5055, + "time_per_iteration": 3.0379533767700195 + }, + { + "auxiliary_loss_clip": 0.01504332, + "auxiliary_loss_mlp": 0.01265491, + "balance_loss_clip": 1.1405884, + "balance_loss_mlp": 1.0244019, + "epoch": 0.6079480550712439, + "flos": 38439549728640.0, + "grad_norm": 2.6920296110408968, + "language_loss": 0.66237646, + "learning_rate": 1.4069106506064874e-06, + "loss": 0.69007468, + "num_input_tokens_seen": 108964215, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.40625, + "step": 5056, + "time_per_iteration": 3.247668743133545 + }, + { + "auxiliary_loss_clip": 0.01505802, + "auxiliary_loss_mlp": 0.01260812, + "balance_loss_clip": 1.14103544, + "balance_loss_mlp": 1.02182078, + "epoch": 0.608068297961883, + "flos": 25338539407680.0, + "grad_norm": 2.351224126946267, + "language_loss": 0.78184873, + "learning_rate": 1.4061667629012989e-06, + "loss": 0.80951482, + "num_input_tokens_seen": 108984885, + "router_z_loss_clip": 3.65039062, + "router_z_loss_mlp": 2.38476562, + "step": 5057, + "time_per_iteration": 3.0213420391082764 + }, + { + "auxiliary_loss_clip": 0.01511494, + "auxiliary_loss_mlp": 0.01268834, + "balance_loss_clip": 1.14743721, + "balance_loss_mlp": 1.02908063, + "epoch": 0.608188540852522, + "flos": 24204514057920.0, + "grad_norm": 1.5781563600213184, + "language_loss": 0.8309046, + "learning_rate": 1.40542296528014e-06, + "loss": 0.8587079, + "num_input_tokens_seen": 109004545, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 2.39257812, + "step": 5058, + "time_per_iteration": 3.0110788345336914 + }, + { + "auxiliary_loss_clip": 0.01503617, + "auxiliary_loss_mlp": 0.0126893, + "balance_loss_clip": 1.14055634, + "balance_loss_mlp": 1.02421689, + "epoch": 0.6083087837431612, + "flos": 21286064712960.0, + "grad_norm": 2.0320772148678876, + "language_loss": 0.76117516, + "learning_rate": 1.4046792578558452e-06, + "loss": 0.78890061, + "num_input_tokens_seen": 109022440, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.44335938, + "step": 5059, + "time_per_iteration": 2.989846706390381 + }, + { + "auxiliary_loss_clip": 0.01505657, + "auxiliary_loss_mlp": 0.0126863, + "balance_loss_clip": 1.1418395, + "balance_loss_mlp": 1.02906692, + "epoch": 0.6084290266338003, + "flos": 16473274472160.0, + "grad_norm": 2.2564747264258855, + "language_loss": 0.75818717, + "learning_rate": 1.4039356407412325e-06, + "loss": 0.78593004, + "num_input_tokens_seen": 109035680, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 2.390625, + "step": 5060, + "time_per_iteration": 2.9015085697174072 + }, + { + "auxiliary_loss_clip": 0.01497699, + "auxiliary_loss_mlp": 0.01192406, + "balance_loss_clip": 1.14022624, + "balance_loss_mlp": 0.99823761, + "epoch": 0.6085492695244393, + "flos": 66449509824960.0, + "grad_norm": 0.785048436800187, + "language_loss": 0.5707885, + "learning_rate": 1.40319211404911e-06, + "loss": 0.59768963, + "num_input_tokens_seen": 109090680, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 1.94140625, + "step": 5061, + "time_per_iteration": 3.4751908779144287 + }, + { + "auxiliary_loss_clip": 0.01502335, + "auxiliary_loss_mlp": 0.01263242, + "balance_loss_clip": 1.13931918, + "balance_loss_mlp": 1.02348781, + "epoch": 0.6086695124150785, + "flos": 23621021415360.0, + "grad_norm": 19.012307290071988, + "language_loss": 0.90611839, + "learning_rate": 1.4024486778922691e-06, + "loss": 0.93377411, + "num_input_tokens_seen": 109108995, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.39257812, + "step": 5062, + "time_per_iteration": 3.005603551864624 + }, + { + "auxiliary_loss_clip": 0.01505517, + "auxiliary_loss_mlp": 0.01265828, + "balance_loss_clip": 1.14235818, + "balance_loss_mlp": 1.02416694, + "epoch": 0.6087897553057176, + "flos": 20159131929120.0, + "grad_norm": 2.163158917879996, + "language_loss": 0.77579528, + "learning_rate": 1.4017053323834884e-06, + "loss": 0.80350876, + "num_input_tokens_seen": 109128825, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 2.41210938, + "step": 5063, + "time_per_iteration": 4.685501575469971 + }, + { + "auxiliary_loss_clip": 0.01504342, + "auxiliary_loss_mlp": 0.01266601, + "balance_loss_clip": 1.14191747, + "balance_loss_mlp": 1.02627492, + "epoch": 0.6089099981963566, + "flos": 25484829778080.0, + "grad_norm": 3.316522373233392, + "language_loss": 0.7607066, + "learning_rate": 1.4009620776355333e-06, + "loss": 0.78841603, + "num_input_tokens_seen": 109150425, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.3984375, + "step": 5064, + "time_per_iteration": 3.1003942489624023 + }, + { + "auxiliary_loss_clip": 0.01504079, + "auxiliary_loss_mlp": 0.01270264, + "balance_loss_clip": 1.14206719, + "balance_loss_mlp": 1.02822149, + "epoch": 0.6090302410869958, + "flos": 25334974160640.0, + "grad_norm": 1.77300956041934, + "language_loss": 0.79313082, + "learning_rate": 1.4002189137611553e-06, + "loss": 0.82087427, + "num_input_tokens_seen": 109169765, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.41601562, + "step": 5065, + "time_per_iteration": 3.1000730991363525 + }, + { + "auxiliary_loss_clip": 0.01507924, + "auxiliary_loss_mlp": 0.01267786, + "balance_loss_clip": 1.14511228, + "balance_loss_mlp": 1.02650619, + "epoch": 0.6091504839776348, + "flos": 23989610917440.0, + "grad_norm": 1.7638981691780757, + "language_loss": 0.69802368, + "learning_rate": 1.3994758408730901e-06, + "loss": 0.72578084, + "num_input_tokens_seen": 109188950, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.40820312, + "step": 5066, + "time_per_iteration": 3.0951883792877197 + }, + { + "auxiliary_loss_clip": 0.01504131, + "auxiliary_loss_mlp": 0.01275083, + "balance_loss_clip": 1.14133763, + "balance_loss_mlp": 1.0296073, + "epoch": 0.6092707268682739, + "flos": 29645628390720.0, + "grad_norm": 2.6126615594762943, + "language_loss": 0.76697433, + "learning_rate": 1.3987328590840629e-06, + "loss": 0.79476643, + "num_input_tokens_seen": 109209895, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.45117188, + "step": 5067, + "time_per_iteration": 3.9337406158447266 + }, + { + "auxiliary_loss_clip": 0.01503418, + "auxiliary_loss_mlp": 0.01268386, + "balance_loss_clip": 1.14082801, + "balance_loss_mlp": 1.02634323, + "epoch": 0.609390969758913, + "flos": 24027273944640.0, + "grad_norm": 1.8162492325961475, + "language_loss": 0.86560369, + "learning_rate": 1.397989968506783e-06, + "loss": 0.89332169, + "num_input_tokens_seen": 109228905, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.41601562, + "step": 5068, + "time_per_iteration": 3.0422441959381104 + }, + { + "auxiliary_loss_clip": 0.01507515, + "auxiliary_loss_mlp": 0.01275069, + "balance_loss_clip": 1.14574003, + "balance_loss_mlp": 1.03054702, + "epoch": 0.6095112126495521, + "flos": 11103352024320.0, + "grad_norm": 2.6239381105569373, + "language_loss": 0.72395688, + "learning_rate": 1.3972471692539458e-06, + "loss": 0.75178272, + "num_input_tokens_seen": 109243620, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 2.44140625, + "step": 5069, + "time_per_iteration": 2.997765064239502 + }, + { + "auxiliary_loss_clip": 0.01501601, + "auxiliary_loss_mlp": 0.01273555, + "balance_loss_clip": 1.13860643, + "balance_loss_mlp": 1.03341985, + "epoch": 0.6096314555401912, + "flos": 17266891307040.0, + "grad_norm": 1.938067485668388, + "language_loss": 0.75071329, + "learning_rate": 1.3965044614382348e-06, + "loss": 0.77846491, + "num_input_tokens_seen": 109259070, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.39648438, + "step": 5070, + "time_per_iteration": 3.0303115844726562 + }, + { + "auxiliary_loss_clip": 0.0151093, + "auxiliary_loss_mlp": 0.01278988, + "balance_loss_clip": 1.14783227, + "balance_loss_mlp": 1.03503776, + "epoch": 0.6097516984308303, + "flos": 21647675433600.0, + "grad_norm": 2.5493523540283043, + "language_loss": 0.75687623, + "learning_rate": 1.3957618451723162e-06, + "loss": 0.78477538, + "num_input_tokens_seen": 109275100, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.43554688, + "step": 5071, + "time_per_iteration": 2.992889404296875 + }, + { + "auxiliary_loss_clip": 0.01505916, + "auxiliary_loss_mlp": 0.01276179, + "balance_loss_clip": 1.14429843, + "balance_loss_mlp": 1.03127599, + "epoch": 0.6098719413214694, + "flos": 27201702991680.0, + "grad_norm": 1.9892401707405454, + "language_loss": 0.71514213, + "learning_rate": 1.3950193205688457e-06, + "loss": 0.74296314, + "num_input_tokens_seen": 109294825, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.4453125, + "step": 5072, + "time_per_iteration": 3.9008383750915527 + }, + { + "auxiliary_loss_clip": 0.01508577, + "auxiliary_loss_mlp": 0.01270602, + "balance_loss_clip": 1.14661324, + "balance_loss_mlp": 1.02874994, + "epoch": 0.6099921842121084, + "flos": 20415073703040.0, + "grad_norm": 1.9387776893990376, + "language_loss": 0.83753037, + "learning_rate": 1.3942768877404627e-06, + "loss": 0.86532211, + "num_input_tokens_seen": 109313790, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.4140625, + "step": 5073, + "time_per_iteration": 3.095201015472412 + }, + { + "auxiliary_loss_clip": 0.0150395, + "auxiliary_loss_mlp": 0.01263336, + "balance_loss_clip": 1.14218044, + "balance_loss_mlp": 1.02205658, + "epoch": 0.6101124271027476, + "flos": 23368303535040.0, + "grad_norm": 5.429991905988059, + "language_loss": 0.73593754, + "learning_rate": 1.393534546799795e-06, + "loss": 0.76361036, + "num_input_tokens_seen": 109333490, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.40820312, + "step": 5074, + "time_per_iteration": 3.1960995197296143 + }, + { + "auxiliary_loss_clip": 0.01510331, + "auxiliary_loss_mlp": 0.01277789, + "balance_loss_clip": 1.14683056, + "balance_loss_mlp": 1.03650975, + "epoch": 0.6102326699933867, + "flos": 26690274581760.0, + "grad_norm": 1.6791797485179931, + "language_loss": 0.67914587, + "learning_rate": 1.3927922978594536e-06, + "loss": 0.70702708, + "num_input_tokens_seen": 109354575, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 2.40820312, + "step": 5075, + "time_per_iteration": 3.1380131244659424 + }, + { + "auxiliary_loss_clip": 0.01501237, + "auxiliary_loss_mlp": 0.01214096, + "balance_loss_clip": 1.14487338, + "balance_loss_mlp": 1.01802063, + "epoch": 0.6103529128840257, + "flos": 60650350018560.0, + "grad_norm": 0.8171315332398288, + "language_loss": 0.57414824, + "learning_rate": 1.3920501410320387e-06, + "loss": 0.60130155, + "num_input_tokens_seen": 109410690, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 1.95703125, + "step": 5076, + "time_per_iteration": 3.442981481552124 + }, + { + "auxiliary_loss_clip": 0.01509734, + "auxiliary_loss_mlp": 0.01267175, + "balance_loss_clip": 1.14790154, + "balance_loss_mlp": 1.0228436, + "epoch": 0.6104731557746649, + "flos": 19023058458720.0, + "grad_norm": 2.316592427825861, + "language_loss": 0.76437235, + "learning_rate": 1.3913080764301333e-06, + "loss": 0.79214144, + "num_input_tokens_seen": 109427650, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.43945312, + "step": 5077, + "time_per_iteration": 2.9588465690612793 + }, + { + "auxiliary_loss_clip": 0.01509143, + "auxiliary_loss_mlp": 0.01272933, + "balance_loss_clip": 1.14634383, + "balance_loss_mlp": 1.02879262, + "epoch": 0.6105933986653039, + "flos": 23369100026400.0, + "grad_norm": 1.770239547124388, + "language_loss": 0.71196187, + "learning_rate": 1.3905661041663085e-06, + "loss": 0.73978263, + "num_input_tokens_seen": 109448835, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.4375, + "step": 5078, + "time_per_iteration": 3.023007392883301 + }, + { + "auxiliary_loss_clip": 0.01507697, + "auxiliary_loss_mlp": 0.01277142, + "balance_loss_clip": 1.14489079, + "balance_loss_mlp": 1.03300095, + "epoch": 0.610713641555943, + "flos": 34640210124000.0, + "grad_norm": 2.2768404739831944, + "language_loss": 0.65176511, + "learning_rate": 1.389824224353122e-06, + "loss": 0.67961347, + "num_input_tokens_seen": 109470425, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.4375, + "step": 5079, + "time_per_iteration": 3.093057155609131 + }, + { + "auxiliary_loss_clip": 0.01506506, + "auxiliary_loss_mlp": 0.01273646, + "balance_loss_clip": 1.14299321, + "balance_loss_mlp": 1.03198493, + "epoch": 0.610833884446582, + "flos": 26648629097760.0, + "grad_norm": 1.8930051907501595, + "language_loss": 0.76770246, + "learning_rate": 1.389082437103115e-06, + "loss": 0.79550397, + "num_input_tokens_seen": 109489695, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 2.41210938, + "step": 5080, + "time_per_iteration": 3.0296010971069336 + }, + { + "auxiliary_loss_clip": 0.01515547, + "auxiliary_loss_mlp": 0.01271047, + "balance_loss_clip": 1.15401471, + "balance_loss_mlp": 1.02824104, + "epoch": 0.6109541273372212, + "flos": 21217262302080.0, + "grad_norm": 1.9938275675318446, + "language_loss": 0.77809685, + "learning_rate": 1.3883407425288172e-06, + "loss": 0.8059628, + "num_input_tokens_seen": 109510030, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.42382812, + "step": 5081, + "time_per_iteration": 3.032606840133667 + }, + { + "auxiliary_loss_clip": 0.01505424, + "auxiliary_loss_mlp": 0.01276014, + "balance_loss_clip": 1.14270639, + "balance_loss_mlp": 1.03301764, + "epoch": 0.6110743702278603, + "flos": 20086157420640.0, + "grad_norm": 2.79242646019495, + "language_loss": 0.79941285, + "learning_rate": 1.3875991407427417e-06, + "loss": 0.82722718, + "num_input_tokens_seen": 109528255, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.42578125, + "step": 5082, + "time_per_iteration": 2.9977378845214844 + }, + { + "auxiliary_loss_clip": 0.01500161, + "auxiliary_loss_mlp": 0.0120446, + "balance_loss_clip": 1.1433382, + "balance_loss_mlp": 1.00991058, + "epoch": 0.6111946131184993, + "flos": 68308918521120.0, + "grad_norm": 0.7733382336377814, + "language_loss": 0.58152515, + "learning_rate": 1.38685763185739e-06, + "loss": 0.60857135, + "num_input_tokens_seen": 109581915, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 1.9453125, + "step": 5083, + "time_per_iteration": 3.5326855182647705 + }, + { + "auxiliary_loss_clip": 0.01505637, + "auxiliary_loss_mlp": 0.01272645, + "balance_loss_clip": 1.14303625, + "balance_loss_mlp": 1.03098369, + "epoch": 0.6113148560091385, + "flos": 19939715337600.0, + "grad_norm": 2.6207346979273316, + "language_loss": 0.67806047, + "learning_rate": 1.3861162159852476e-06, + "loss": 0.70584327, + "num_input_tokens_seen": 109600050, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.41210938, + "step": 5084, + "time_per_iteration": 3.0414175987243652 + }, + { + "auxiliary_loss_clip": 0.01504317, + "auxiliary_loss_mlp": 0.01285764, + "balance_loss_clip": 1.14190984, + "balance_loss_mlp": 1.04219556, + "epoch": 0.6114350988997775, + "flos": 23734086353280.0, + "grad_norm": 2.030554956410865, + "language_loss": 0.79914695, + "learning_rate": 1.3853748932387875e-06, + "loss": 0.82704771, + "num_input_tokens_seen": 109620690, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.43164062, + "step": 5085, + "time_per_iteration": 3.080876350402832 + }, + { + "auxiliary_loss_clip": 0.01511644, + "auxiliary_loss_mlp": 0.01271089, + "balance_loss_clip": 1.15022421, + "balance_loss_mlp": 1.02599514, + "epoch": 0.6115553417904166, + "flos": 24025946459040.0, + "grad_norm": 3.835997269613516, + "language_loss": 0.75242305, + "learning_rate": 1.3846336637304671e-06, + "loss": 0.78025043, + "num_input_tokens_seen": 109638960, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 2.44726562, + "step": 5086, + "time_per_iteration": 3.076124429702759 + }, + { + "auxiliary_loss_clip": 0.01504469, + "auxiliary_loss_mlp": 0.01264487, + "balance_loss_clip": 1.14108706, + "balance_loss_mlp": 1.02168202, + "epoch": 0.6116755846810558, + "flos": 23735868976800.0, + "grad_norm": 2.1546237151345045, + "language_loss": 0.83654928, + "learning_rate": 1.3838925275727316e-06, + "loss": 0.8642388, + "num_input_tokens_seen": 109659700, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.42382812, + "step": 5087, + "time_per_iteration": 3.1011929512023926 + }, + { + "auxiliary_loss_clip": 0.01510071, + "auxiliary_loss_mlp": 0.01264807, + "balance_loss_clip": 1.14786196, + "balance_loss_mlp": 1.02066612, + "epoch": 0.6117958275716948, + "flos": 18663647571360.0, + "grad_norm": 2.005610167102805, + "language_loss": 0.79303122, + "learning_rate": 1.3831514848780089e-06, + "loss": 0.82077992, + "num_input_tokens_seen": 109679275, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.4375, + "step": 5088, + "time_per_iteration": 3.038510799407959 + }, + { + "auxiliary_loss_clip": 0.01510934, + "auxiliary_loss_mlp": 0.01264238, + "balance_loss_clip": 1.14980769, + "balance_loss_mlp": 1.02353096, + "epoch": 0.6119160704623339, + "flos": 16473008975040.0, + "grad_norm": 2.437980939531307, + "language_loss": 0.91738009, + "learning_rate": 1.3824105357587152e-06, + "loss": 0.94513178, + "num_input_tokens_seen": 109696380, + "router_z_loss_clip": 3.609375, + "router_z_loss_mlp": 2.40234375, + "step": 5089, + "time_per_iteration": 3.021148681640625 + }, + { + "auxiliary_loss_clip": 0.01503999, + "auxiliary_loss_mlp": 0.01267092, + "balance_loss_clip": 1.14213634, + "balance_loss_mlp": 1.02428627, + "epoch": 0.612036313352973, + "flos": 23917774253760.0, + "grad_norm": 1.5487938988635312, + "language_loss": 0.82775986, + "learning_rate": 1.381669680327253e-06, + "loss": 0.85547078, + "num_input_tokens_seen": 109718060, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.42382812, + "step": 5090, + "time_per_iteration": 4.749905109405518 + }, + { + "auxiliary_loss_clip": 0.01512704, + "auxiliary_loss_mlp": 0.01269339, + "balance_loss_clip": 1.15074706, + "balance_loss_mlp": 1.02844119, + "epoch": 0.6121565562436121, + "flos": 26976938529600.0, + "grad_norm": 2.277974606739741, + "language_loss": 0.71186221, + "learning_rate": 1.380928918696008e-06, + "loss": 0.73968267, + "num_input_tokens_seen": 109736830, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.40429688, + "step": 5091, + "time_per_iteration": 3.0368268489837646 + }, + { + "auxiliary_loss_clip": 0.01496948, + "auxiliary_loss_mlp": 0.01274856, + "balance_loss_clip": 1.13393879, + "balance_loss_mlp": 1.0316689, + "epoch": 0.6122767991342511, + "flos": 15670668663360.0, + "grad_norm": 2.5272120459705048, + "language_loss": 0.7173444, + "learning_rate": 1.3801882509773548e-06, + "loss": 0.74506235, + "num_input_tokens_seen": 109754690, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.42773438, + "step": 5092, + "time_per_iteration": 3.0471553802490234 + }, + { + "auxiliary_loss_clip": 0.0149982, + "auxiliary_loss_mlp": 0.01267201, + "balance_loss_clip": 1.13757777, + "balance_loss_mlp": 1.02687502, + "epoch": 0.6123970420248903, + "flos": 27966797485920.0, + "grad_norm": 1.817502201825908, + "language_loss": 0.81837279, + "learning_rate": 1.3794476772836503e-06, + "loss": 0.84604299, + "num_input_tokens_seen": 109775790, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.3984375, + "step": 5093, + "time_per_iteration": 3.0691912174224854 + }, + { + "auxiliary_loss_clip": 0.01504392, + "auxiliary_loss_mlp": 0.01269457, + "balance_loss_clip": 1.14268959, + "balance_loss_mlp": 1.02493477, + "epoch": 0.6125172849155294, + "flos": 21470776673760.0, + "grad_norm": 1.6896484215212604, + "language_loss": 0.8435216, + "learning_rate": 1.3787071977272402e-06, + "loss": 0.87126017, + "num_input_tokens_seen": 109795050, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 2.44140625, + "step": 5094, + "time_per_iteration": 3.855299949645996 + }, + { + "auxiliary_loss_clip": 0.01502817, + "auxiliary_loss_mlp": 0.0127003, + "balance_loss_clip": 1.14063072, + "balance_loss_mlp": 1.02417302, + "epoch": 0.6126375278061684, + "flos": 16249913352000.0, + "grad_norm": 3.083702126938305, + "language_loss": 0.71580923, + "learning_rate": 1.3779668124204535e-06, + "loss": 0.74353766, + "num_input_tokens_seen": 109811465, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.45507812, + "step": 5095, + "time_per_iteration": 3.018068790435791 + }, + { + "auxiliary_loss_clip": 0.01504014, + "auxiliary_loss_mlp": 0.01269725, + "balance_loss_clip": 1.14208269, + "balance_loss_mlp": 1.03016162, + "epoch": 0.6127577706968076, + "flos": 20450954106720.0, + "grad_norm": 1.9296456280347125, + "language_loss": 0.81073308, + "learning_rate": 1.3772265214756074e-06, + "loss": 0.83847046, + "num_input_tokens_seen": 109831225, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.390625, + "step": 5096, + "time_per_iteration": 3.0511462688446045 + }, + { + "auxiliary_loss_clip": 0.01502514, + "auxiliary_loss_mlp": 0.01266282, + "balance_loss_clip": 1.14015484, + "balance_loss_mlp": 1.02538371, + "epoch": 0.6128780135874466, + "flos": 18262401559200.0, + "grad_norm": 2.026118195120665, + "language_loss": 0.75076723, + "learning_rate": 1.3764863250050025e-06, + "loss": 0.77845526, + "num_input_tokens_seen": 109849465, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.40429688, + "step": 5097, + "time_per_iteration": 3.031942129135132 + }, + { + "auxiliary_loss_clip": 0.01504741, + "auxiliary_loss_mlp": 0.01270369, + "balance_loss_clip": 1.1433605, + "balance_loss_mlp": 1.02908897, + "epoch": 0.6129982564780857, + "flos": 24939455300640.0, + "grad_norm": 2.156533219657017, + "language_loss": 0.80626702, + "learning_rate": 1.3757462231209272e-06, + "loss": 0.83401811, + "num_input_tokens_seen": 109869770, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.40820312, + "step": 5098, + "time_per_iteration": 3.15087628364563 + }, + { + "auxiliary_loss_clip": 0.01498403, + "auxiliary_loss_mlp": 0.01277011, + "balance_loss_clip": 1.13630891, + "balance_loss_mlp": 1.03515947, + "epoch": 0.6131184993687249, + "flos": 22494240344160.0, + "grad_norm": 1.9488348459994993, + "language_loss": 0.88848561, + "learning_rate": 1.3750062159356525e-06, + "loss": 0.91623974, + "num_input_tokens_seen": 109889120, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 2.4140625, + "step": 5099, + "time_per_iteration": 3.860781669616699 + }, + { + "auxiliary_loss_clip": 0.01502532, + "auxiliary_loss_mlp": 0.01268354, + "balance_loss_clip": 1.14063787, + "balance_loss_mlp": 1.02802813, + "epoch": 0.6132387422593639, + "flos": 15887544068160.0, + "grad_norm": 1.888118752843958, + "language_loss": 0.83199179, + "learning_rate": 1.3742663035614382e-06, + "loss": 0.85970068, + "num_input_tokens_seen": 109906490, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 2.3984375, + "step": 5100, + "time_per_iteration": 2.9319956302642822 + }, + { + "auxiliary_loss_clip": 0.01504303, + "auxiliary_loss_mlp": 0.01277611, + "balance_loss_clip": 1.14273334, + "balance_loss_mlp": 1.0348053, + "epoch": 0.613358985150003, + "flos": 25413941318400.0, + "grad_norm": 1.782986355461579, + "language_loss": 0.80079073, + "learning_rate": 1.3735264861105283e-06, + "loss": 0.82860982, + "num_input_tokens_seen": 109927130, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 2.42382812, + "step": 5101, + "time_per_iteration": 3.0917599201202393 + }, + { + "auxiliary_loss_clip": 0.01496545, + "auxiliary_loss_mlp": 0.01263811, + "balance_loss_clip": 1.1340214, + "balance_loss_mlp": 1.02386665, + "epoch": 0.6134792280406421, + "flos": 21363438888000.0, + "grad_norm": 4.459572891565791, + "language_loss": 0.78390288, + "learning_rate": 1.372786763695152e-06, + "loss": 0.81150645, + "num_input_tokens_seen": 109945890, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.39453125, + "step": 5102, + "time_per_iteration": 2.9527108669281006 + }, + { + "auxiliary_loss_clip": 0.01503687, + "auxiliary_loss_mlp": 0.01273351, + "balance_loss_clip": 1.14206052, + "balance_loss_mlp": 1.02844739, + "epoch": 0.6135994709312812, + "flos": 21213166060800.0, + "grad_norm": 2.8727705479595658, + "language_loss": 0.77765203, + "learning_rate": 1.3720471364275257e-06, + "loss": 0.80542231, + "num_input_tokens_seen": 109965535, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.4453125, + "step": 5103, + "time_per_iteration": 3.0105197429656982 + }, + { + "auxiliary_loss_clip": 0.01500526, + "auxiliary_loss_mlp": 0.01276651, + "balance_loss_clip": 1.13841963, + "balance_loss_mlp": 1.03346407, + "epoch": 0.6137197138219203, + "flos": 14795846909280.0, + "grad_norm": 2.6032108224777124, + "language_loss": 0.77837688, + "learning_rate": 1.3713076044198486e-06, + "loss": 0.80614865, + "num_input_tokens_seen": 109982345, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 2.42773438, + "step": 5104, + "time_per_iteration": 3.0778069496154785 + }, + { + "auxiliary_loss_clip": 0.01506455, + "auxiliary_loss_mlp": 0.01266226, + "balance_loss_clip": 1.14465237, + "balance_loss_mlp": 1.02284849, + "epoch": 0.6138399567125594, + "flos": 20086612558560.0, + "grad_norm": 2.416403947803648, + "language_loss": 0.81176078, + "learning_rate": 1.3705681677843086e-06, + "loss": 0.83948755, + "num_input_tokens_seen": 110000940, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 2.4296875, + "step": 5105, + "time_per_iteration": 3.026395082473755 + }, + { + "auxiliary_loss_clip": 0.01487112, + "auxiliary_loss_mlp": 0.01202652, + "balance_loss_clip": 1.13130689, + "balance_loss_mlp": 1.00352478, + "epoch": 0.6139601996031985, + "flos": 60131032187040.0, + "grad_norm": 0.7798913435989795, + "language_loss": 0.60512841, + "learning_rate": 1.3698288266330768e-06, + "loss": 0.63202608, + "num_input_tokens_seen": 110061565, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 1.984375, + "step": 5106, + "time_per_iteration": 3.5770561695098877 + }, + { + "auxiliary_loss_clip": 0.01509687, + "auxiliary_loss_mlp": 0.01270672, + "balance_loss_clip": 1.14895403, + "balance_loss_mlp": 1.02843904, + "epoch": 0.6140804424938375, + "flos": 23588971755840.0, + "grad_norm": 3.193274133873475, + "language_loss": 0.72812384, + "learning_rate": 1.3690895810783113e-06, + "loss": 0.75592744, + "num_input_tokens_seen": 110080360, + "router_z_loss_clip": 3.60742188, + "router_z_loss_mlp": 2.41796875, + "step": 5107, + "time_per_iteration": 3.1217033863067627 + }, + { + "auxiliary_loss_clip": 0.01497283, + "auxiliary_loss_mlp": 0.0126962, + "balance_loss_clip": 1.13544345, + "balance_loss_mlp": 1.02795863, + "epoch": 0.6142006853844767, + "flos": 21400381280160.0, + "grad_norm": 2.298241035693338, + "language_loss": 0.71583569, + "learning_rate": 1.3683504312321543e-06, + "loss": 0.7435047, + "num_input_tokens_seen": 110100695, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 2.41210938, + "step": 5108, + "time_per_iteration": 3.0943949222564697 + }, + { + "auxiliary_loss_clip": 0.01503513, + "auxiliary_loss_mlp": 0.01268687, + "balance_loss_clip": 1.14070535, + "balance_loss_mlp": 1.02511871, + "epoch": 0.6143209282751158, + "flos": 12058999416000.0, + "grad_norm": 2.1201420994976745, + "language_loss": 0.79882371, + "learning_rate": 1.3676113772067355e-06, + "loss": 0.82654572, + "num_input_tokens_seen": 110117750, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.43164062, + "step": 5109, + "time_per_iteration": 3.2511489391326904 + }, + { + "auxiliary_loss_clip": 0.01504683, + "auxiliary_loss_mlp": 0.01267352, + "balance_loss_clip": 1.14222348, + "balance_loss_mlp": 1.02435565, + "epoch": 0.6144411711657548, + "flos": 25084607826240.0, + "grad_norm": 2.1515878256526952, + "language_loss": 0.7247299, + "learning_rate": 1.3668724191141671e-06, + "loss": 0.75245029, + "num_input_tokens_seen": 110137020, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.42578125, + "step": 5110, + "time_per_iteration": 3.091813564300537 + }, + { + "auxiliary_loss_clip": 0.01507625, + "auxiliary_loss_mlp": 0.01260336, + "balance_loss_clip": 1.14690518, + "balance_loss_mlp": 1.01848447, + "epoch": 0.6145614140563939, + "flos": 20115779677920.0, + "grad_norm": 2.177167543195841, + "language_loss": 0.66498893, + "learning_rate": 1.3661335570665493e-06, + "loss": 0.69266856, + "num_input_tokens_seen": 110154930, + "router_z_loss_clip": 3.60742188, + "router_z_loss_mlp": 2.4140625, + "step": 5111, + "time_per_iteration": 3.1061716079711914 + }, + { + "auxiliary_loss_clip": 0.01511234, + "auxiliary_loss_mlp": 0.01275472, + "balance_loss_clip": 1.150033, + "balance_loss_mlp": 1.03304791, + "epoch": 0.614681656947033, + "flos": 16802266610880.0, + "grad_norm": 2.6198032379874876, + "language_loss": 0.70082587, + "learning_rate": 1.3653947911759676e-06, + "loss": 0.72869295, + "num_input_tokens_seen": 110172480, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 2.41992188, + "step": 5112, + "time_per_iteration": 3.175426483154297 + }, + { + "auxiliary_loss_clip": 0.01510432, + "auxiliary_loss_mlp": 0.01274029, + "balance_loss_clip": 1.14901948, + "balance_loss_mlp": 1.0312233, + "epoch": 0.6148018998376721, + "flos": 38804725696320.0, + "grad_norm": 13.312759254304954, + "language_loss": 0.74376225, + "learning_rate": 1.3646561215544904e-06, + "loss": 0.77160686, + "num_input_tokens_seen": 110197120, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 2.42382812, + "step": 5113, + "time_per_iteration": 3.289898157119751 + }, + { + "auxiliary_loss_clip": 0.0150291, + "auxiliary_loss_mlp": 0.01260249, + "balance_loss_clip": 1.14004874, + "balance_loss_mlp": 1.01858783, + "epoch": 0.6149221427283111, + "flos": 23329047525120.0, + "grad_norm": 2.7739630086037623, + "language_loss": 0.79429078, + "learning_rate": 1.363917548314176e-06, + "loss": 0.82192236, + "num_input_tokens_seen": 110216385, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.41210938, + "step": 5114, + "time_per_iteration": 3.0885493755340576 + }, + { + "auxiliary_loss_clip": 0.01501879, + "auxiliary_loss_mlp": 0.01271631, + "balance_loss_clip": 1.13848329, + "balance_loss_mlp": 1.02653646, + "epoch": 0.6150423856189503, + "flos": 22381516759680.0, + "grad_norm": 3.14648961799825, + "language_loss": 0.73434716, + "learning_rate": 1.3631790715670626e-06, + "loss": 0.76208222, + "num_input_tokens_seen": 110234790, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.44726562, + "step": 5115, + "time_per_iteration": 3.0151214599609375 + }, + { + "auxiliary_loss_clip": 0.0150625, + "auxiliary_loss_mlp": 0.01268331, + "balance_loss_clip": 1.14284027, + "balance_loss_mlp": 1.03010333, + "epoch": 0.6151626285095894, + "flos": 18693762894720.0, + "grad_norm": 2.1111867287522, + "language_loss": 0.8567186, + "learning_rate": 1.3624406914251783e-06, + "loss": 0.88446438, + "num_input_tokens_seen": 110251910, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.37695312, + "step": 5116, + "time_per_iteration": 2.9583473205566406 + }, + { + "auxiliary_loss_clip": 0.01501734, + "auxiliary_loss_mlp": 0.01266969, + "balance_loss_clip": 1.13940334, + "balance_loss_mlp": 1.02530837, + "epoch": 0.6152828714002284, + "flos": 15853522144320.0, + "grad_norm": 1.8438012565114907, + "language_loss": 0.88369459, + "learning_rate": 1.3617024080005335e-06, + "loss": 0.9113816, + "num_input_tokens_seen": 110268810, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 2.41210938, + "step": 5117, + "time_per_iteration": 3.9089725017547607 + }, + { + "auxiliary_loss_clip": 0.01498404, + "auxiliary_loss_mlp": 0.01289868, + "balance_loss_clip": 1.13677108, + "balance_loss_mlp": 1.04553699, + "epoch": 0.6154031142908676, + "flos": 24872473441440.0, + "grad_norm": 1.8504543063350647, + "language_loss": 0.74493301, + "learning_rate": 1.3609642214051266e-06, + "loss": 0.77281576, + "num_input_tokens_seen": 110293035, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 2.43945312, + "step": 5118, + "time_per_iteration": 3.8348379135131836 + }, + { + "auxiliary_loss_clip": 0.01508373, + "auxiliary_loss_mlp": 0.01275211, + "balance_loss_clip": 1.1473124, + "balance_loss_mlp": 1.03297734, + "epoch": 0.6155233571815066, + "flos": 19246760932320.0, + "grad_norm": 1.8940666672832045, + "language_loss": 0.66148996, + "learning_rate": 1.3602261317509385e-06, + "loss": 0.68932581, + "num_input_tokens_seen": 110309695, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 2.41796875, + "step": 5119, + "time_per_iteration": 2.9367823600769043 + }, + { + "auxiliary_loss_clip": 0.0150539, + "auxiliary_loss_mlp": 0.01266811, + "balance_loss_clip": 1.14374042, + "balance_loss_mlp": 1.02152562, + "epoch": 0.6156436000721457, + "flos": 18772047345600.0, + "grad_norm": 3.557372497379794, + "language_loss": 0.82833648, + "learning_rate": 1.3594881391499387e-06, + "loss": 0.85605848, + "num_input_tokens_seen": 110328610, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.44921875, + "step": 5120, + "time_per_iteration": 3.0400969982147217 + }, + { + "auxiliary_loss_clip": 0.01504711, + "auxiliary_loss_mlp": 0.01272528, + "balance_loss_clip": 1.14306688, + "balance_loss_mlp": 1.03067636, + "epoch": 0.6157638429627849, + "flos": 18043174608480.0, + "grad_norm": 2.204356840833308, + "language_loss": 0.79015201, + "learning_rate": 1.3587502437140778e-06, + "loss": 0.81792444, + "num_input_tokens_seen": 110346775, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 2.4140625, + "step": 5121, + "time_per_iteration": 3.094611644744873 + }, + { + "auxiliary_loss_clip": 0.01504874, + "auxiliary_loss_mlp": 0.0128064, + "balance_loss_clip": 1.14177012, + "balance_loss_mlp": 1.03745341, + "epoch": 0.6158840858534239, + "flos": 25559093844000.0, + "grad_norm": 2.5698682827233044, + "language_loss": 0.85114622, + "learning_rate": 1.3580124455552952e-06, + "loss": 0.87900138, + "num_input_tokens_seen": 110366140, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.42773438, + "step": 5122, + "time_per_iteration": 3.874220132827759 + }, + { + "auxiliary_loss_clip": 0.01506318, + "auxiliary_loss_mlp": 0.01269434, + "balance_loss_clip": 1.14417624, + "balance_loss_mlp": 1.0277729, + "epoch": 0.616004328744063, + "flos": 24642892103040.0, + "grad_norm": 3.456448959182476, + "language_loss": 0.8749482, + "learning_rate": 1.3572747447855148e-06, + "loss": 0.90270567, + "num_input_tokens_seen": 110386550, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.41210938, + "step": 5123, + "time_per_iteration": 3.1582186222076416 + }, + { + "auxiliary_loss_clip": 0.01512681, + "auxiliary_loss_mlp": 0.01262912, + "balance_loss_clip": 1.14929175, + "balance_loss_mlp": 1.02029765, + "epoch": 0.6161245716347021, + "flos": 21691862104320.0, + "grad_norm": 2.585837471211407, + "language_loss": 0.69552982, + "learning_rate": 1.356537141516644e-06, + "loss": 0.72328579, + "num_input_tokens_seen": 110403970, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.421875, + "step": 5124, + "time_per_iteration": 3.049002170562744 + }, + { + "auxiliary_loss_clip": 0.01510128, + "auxiliary_loss_mlp": 0.01271531, + "balance_loss_clip": 1.14925075, + "balance_loss_mlp": 1.03177726, + "epoch": 0.6162448145253412, + "flos": 35192146173120.0, + "grad_norm": 7.756680564882601, + "language_loss": 0.61323559, + "learning_rate": 1.3557996358605775e-06, + "loss": 0.64105213, + "num_input_tokens_seen": 110423890, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 2.39257812, + "step": 5125, + "time_per_iteration": 3.0652315616607666 + }, + { + "auxiliary_loss_clip": 0.0150732, + "auxiliary_loss_mlp": 0.01261379, + "balance_loss_clip": 1.14583826, + "balance_loss_mlp": 1.02086222, + "epoch": 0.6163650574159802, + "flos": 21617370469440.0, + "grad_norm": 2.5658494519965607, + "language_loss": 0.70174652, + "learning_rate": 1.3550622279291941e-06, + "loss": 0.72943354, + "num_input_tokens_seen": 110442035, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 2.40039062, + "step": 5126, + "time_per_iteration": 3.8846471309661865 + }, + { + "auxiliary_loss_clip": 0.01504539, + "auxiliary_loss_mlp": 0.01270404, + "balance_loss_clip": 1.14096618, + "balance_loss_mlp": 1.02893353, + "epoch": 0.6164853003066194, + "flos": 24574506901920.0, + "grad_norm": 1.4601633213932088, + "language_loss": 0.83294463, + "learning_rate": 1.354324917834358e-06, + "loss": 0.86069405, + "num_input_tokens_seen": 110463280, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 2.41015625, + "step": 5127, + "time_per_iteration": 3.0419938564300537 + }, + { + "auxiliary_loss_clip": 0.01505378, + "auxiliary_loss_mlp": 0.01258482, + "balance_loss_clip": 1.14292336, + "balance_loss_mlp": 1.01853716, + "epoch": 0.6166055431972585, + "flos": 21838190402880.0, + "grad_norm": 2.0622654791569612, + "language_loss": 0.77050287, + "learning_rate": 1.353587705687918e-06, + "loss": 0.79814148, + "num_input_tokens_seen": 110481455, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 2.39453125, + "step": 5128, + "time_per_iteration": 2.988196849822998 + }, + { + "auxiliary_loss_clip": 0.01506574, + "auxiliary_loss_mlp": 0.01266268, + "balance_loss_clip": 1.14488292, + "balance_loss_mlp": 1.02346265, + "epoch": 0.6167257860878975, + "flos": 17787194906400.0, + "grad_norm": 2.480715347691056, + "language_loss": 0.72841299, + "learning_rate": 1.3528505916017096e-06, + "loss": 0.75614136, + "num_input_tokens_seen": 110499155, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 2.42382812, + "step": 5129, + "time_per_iteration": 2.999920129776001 + }, + { + "auxiliary_loss_clip": 0.01512363, + "auxiliary_loss_mlp": 0.01269987, + "balance_loss_clip": 1.15089369, + "balance_loss_mlp": 1.02889824, + "epoch": 0.6168460289785367, + "flos": 23216665294080.0, + "grad_norm": 2.786133197749024, + "language_loss": 0.8861202, + "learning_rate": 1.3521135756875514e-06, + "loss": 0.91394365, + "num_input_tokens_seen": 110515470, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 2.40625, + "step": 5130, + "time_per_iteration": 2.987109661102295 + }, + { + "auxiliary_loss_clip": 0.01505805, + "auxiliary_loss_mlp": 0.01262202, + "balance_loss_clip": 1.14336789, + "balance_loss_mlp": 1.02302051, + "epoch": 0.6169662718691757, + "flos": 26215674779520.0, + "grad_norm": 1.6653407585192532, + "language_loss": 0.86249244, + "learning_rate": 1.3513766580572496e-06, + "loss": 0.89017248, + "num_input_tokens_seen": 110538290, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.38671875, + "step": 5131, + "time_per_iteration": 3.076448917388916 + }, + { + "auxiliary_loss_clip": 0.01508415, + "auxiliary_loss_mlp": 0.01264969, + "balance_loss_clip": 1.14567685, + "balance_loss_mlp": 1.02674115, + "epoch": 0.6170865147598148, + "flos": 19028140832160.0, + "grad_norm": 3.5314170746529623, + "language_loss": 0.77080381, + "learning_rate": 1.3506398388225924e-06, + "loss": 0.79853761, + "num_input_tokens_seen": 110555610, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 2.37695312, + "step": 5132, + "time_per_iteration": 3.1203112602233887 + }, + { + "auxiliary_loss_clip": 0.01511675, + "auxiliary_loss_mlp": 0.01268154, + "balance_loss_clip": 1.14940977, + "balance_loss_mlp": 1.02801836, + "epoch": 0.617206757650454, + "flos": 18262363631040.0, + "grad_norm": 1.9514379726782425, + "language_loss": 0.71750659, + "learning_rate": 1.349903118095355e-06, + "loss": 0.74530494, + "num_input_tokens_seen": 110574745, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 2.39648438, + "step": 5133, + "time_per_iteration": 3.2005512714385986 + }, + { + "auxiliary_loss_clip": 0.01507248, + "auxiliary_loss_mlp": 0.0126807, + "balance_loss_clip": 1.14490664, + "balance_loss_mlp": 1.02564585, + "epoch": 0.617327000541093, + "flos": 18188933984640.0, + "grad_norm": 1.848412486295523, + "language_loss": 0.73305464, + "learning_rate": 1.349166495987298e-06, + "loss": 0.76080787, + "num_input_tokens_seen": 110593310, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.41992188, + "step": 5134, + "time_per_iteration": 3.0085272789001465 + }, + { + "auxiliary_loss_clip": 0.01499842, + "auxiliary_loss_mlp": 0.01197929, + "balance_loss_clip": 1.14589512, + "balance_loss_mlp": 1.001091, + "epoch": 0.6174472434317321, + "flos": 61840812834720.0, + "grad_norm": 0.8415515321647375, + "language_loss": 0.60859883, + "learning_rate": 1.348429972610166e-06, + "loss": 0.63557655, + "num_input_tokens_seen": 110657615, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 1.96484375, + "step": 5135, + "time_per_iteration": 3.6241633892059326 + }, + { + "auxiliary_loss_clip": 0.01497615, + "auxiliary_loss_mlp": 0.01193573, + "balance_loss_clip": 1.14368844, + "balance_loss_mlp": 0.99673462, + "epoch": 0.6175674863223712, + "flos": 71237608469280.0, + "grad_norm": 0.855208289842371, + "language_loss": 0.57705617, + "learning_rate": 1.3476935480756897e-06, + "loss": 0.60396808, + "num_input_tokens_seen": 110714365, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 1.96484375, + "step": 5136, + "time_per_iteration": 3.282256841659546 + }, + { + "auxiliary_loss_clip": 0.01508422, + "auxiliary_loss_mlp": 0.01268974, + "balance_loss_clip": 1.14590716, + "balance_loss_mlp": 1.02731252, + "epoch": 0.6176877292130103, + "flos": 21837621480480.0, + "grad_norm": 2.4407082165780145, + "language_loss": 0.75286591, + "learning_rate": 1.346957222495583e-06, + "loss": 0.78063989, + "num_input_tokens_seen": 110732160, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.41210938, + "step": 5137, + "time_per_iteration": 3.035953998565674 + }, + { + "auxiliary_loss_clip": 0.01506073, + "auxiliary_loss_mlp": 0.01267193, + "balance_loss_clip": 1.1443938, + "balance_loss_mlp": 1.0251509, + "epoch": 0.6178079721036493, + "flos": 17743273732800.0, + "grad_norm": 2.6021877745922177, + "language_loss": 0.71089244, + "learning_rate": 1.3462209959815466e-06, + "loss": 0.73862517, + "num_input_tokens_seen": 110746900, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 2.41601562, + "step": 5138, + "time_per_iteration": 2.969257354736328 + }, + { + "auxiliary_loss_clip": 0.01508351, + "auxiliary_loss_mlp": 0.0127032, + "balance_loss_clip": 1.14621222, + "balance_loss_mlp": 1.02770543, + "epoch": 0.6179282149942885, + "flos": 22635069059520.0, + "grad_norm": 1.886085197118172, + "language_loss": 0.74709982, + "learning_rate": 1.345484868645265e-06, + "loss": 0.77488655, + "num_input_tokens_seen": 110765710, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.421875, + "step": 5139, + "time_per_iteration": 3.092662811279297 + }, + { + "auxiliary_loss_clip": 0.0150633, + "auxiliary_loss_mlp": 0.01270404, + "balance_loss_clip": 1.14241457, + "balance_loss_mlp": 1.03026927, + "epoch": 0.6180484578849276, + "flos": 22312562636160.0, + "grad_norm": 1.9689123138563305, + "language_loss": 0.79001212, + "learning_rate": 1.3447488405984088e-06, + "loss": 0.81777948, + "num_input_tokens_seen": 110783970, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.39648438, + "step": 5140, + "time_per_iteration": 2.9975669384002686 + }, + { + "auxiliary_loss_clip": 0.01502541, + "auxiliary_loss_mlp": 0.01273396, + "balance_loss_clip": 1.13962197, + "balance_loss_mlp": 1.03268886, + "epoch": 0.6181687007755666, + "flos": 35228367930240.0, + "grad_norm": 2.592669567696409, + "language_loss": 0.70282233, + "learning_rate": 1.3440129119526322e-06, + "loss": 0.7305817, + "num_input_tokens_seen": 110806395, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.40234375, + "step": 5141, + "time_per_iteration": 3.069882392883301 + }, + { + "auxiliary_loss_clip": 0.01494787, + "auxiliary_loss_mlp": 0.01221374, + "balance_loss_clip": 1.13971639, + "balance_loss_mlp": 1.02377319, + "epoch": 0.6182889436662057, + "flos": 61552632124800.0, + "grad_norm": 0.8224125383349357, + "language_loss": 0.51176369, + "learning_rate": 1.3432770828195762e-06, + "loss": 0.53892529, + "num_input_tokens_seen": 110867380, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 1.9765625, + "step": 5142, + "time_per_iteration": 3.6108498573303223 + }, + { + "auxiliary_loss_clip": 0.01503887, + "auxiliary_loss_mlp": 0.01270313, + "balance_loss_clip": 1.14064837, + "balance_loss_mlp": 1.02903342, + "epoch": 0.6184091865568448, + "flos": 19612050684480.0, + "grad_norm": 3.1909472877010057, + "language_loss": 0.70617896, + "learning_rate": 1.3425413533108635e-06, + "loss": 0.73392093, + "num_input_tokens_seen": 110885980, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.40820312, + "step": 5143, + "time_per_iteration": 3.1270577907562256 + }, + { + "auxiliary_loss_clip": 0.01508408, + "auxiliary_loss_mlp": 0.01274354, + "balance_loss_clip": 1.14394927, + "balance_loss_mlp": 1.03040397, + "epoch": 0.6185294294474839, + "flos": 23589161396640.0, + "grad_norm": 2.963063609495028, + "language_loss": 0.70676196, + "learning_rate": 1.341805723538105e-06, + "loss": 0.73458958, + "num_input_tokens_seen": 110906085, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 2.43554688, + "step": 5144, + "time_per_iteration": 3.962846040725708 + }, + { + "auxiliary_loss_clip": 0.01501597, + "auxiliary_loss_mlp": 0.01277074, + "balance_loss_clip": 1.13680387, + "balance_loss_mlp": 1.03674853, + "epoch": 0.618649672338123, + "flos": 26764538647680.0, + "grad_norm": 2.568083524968158, + "language_loss": 0.77800667, + "learning_rate": 1.3410701936128948e-06, + "loss": 0.8057934, + "num_input_tokens_seen": 110928865, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 2.3984375, + "step": 5145, + "time_per_iteration": 3.92433762550354 + }, + { + "auxiliary_loss_clip": 0.01506623, + "auxiliary_loss_mlp": 0.01288573, + "balance_loss_clip": 1.14294136, + "balance_loss_mlp": 1.04691172, + "epoch": 0.6187699152287621, + "flos": 14457903724800.0, + "grad_norm": 2.4671004522687414, + "language_loss": 0.85387075, + "learning_rate": 1.340334763646812e-06, + "loss": 0.88182271, + "num_input_tokens_seen": 110943000, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 2.41210938, + "step": 5146, + "time_per_iteration": 3.031174421310425 + }, + { + "auxiliary_loss_clip": 0.01503197, + "auxiliary_loss_mlp": 0.01285566, + "balance_loss_clip": 1.13965118, + "balance_loss_mlp": 1.04104376, + "epoch": 0.6188901581194012, + "flos": 20086878055680.0, + "grad_norm": 2.349943776332016, + "language_loss": 0.74599522, + "learning_rate": 1.3395994337514218e-06, + "loss": 0.77388287, + "num_input_tokens_seen": 110963170, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 2.44140625, + "step": 5147, + "time_per_iteration": 2.9892430305480957 + }, + { + "auxiliary_loss_clip": 0.01503413, + "auxiliary_loss_mlp": 0.01275886, + "balance_loss_clip": 1.13990021, + "balance_loss_mlp": 1.037467, + "epoch": 0.6190104010100402, + "flos": 25702994740320.0, + "grad_norm": 2.2261074683893756, + "language_loss": 0.78568643, + "learning_rate": 1.3388642040382725e-06, + "loss": 0.81347936, + "num_input_tokens_seen": 110983595, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.37890625, + "step": 5148, + "time_per_iteration": 3.2309300899505615 + }, + { + "auxiliary_loss_clip": 0.0150863, + "auxiliary_loss_mlp": 0.01275845, + "balance_loss_clip": 1.14589095, + "balance_loss_mlp": 1.03246725, + "epoch": 0.6191306439006794, + "flos": 30444820665120.0, + "grad_norm": 2.442885631813327, + "language_loss": 0.84126961, + "learning_rate": 1.3381290746188975e-06, + "loss": 0.8691144, + "num_input_tokens_seen": 111002965, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.4296875, + "step": 5149, + "time_per_iteration": 3.9769387245178223 + }, + { + "auxiliary_loss_clip": 0.01508899, + "auxiliary_loss_mlp": 0.0127235, + "balance_loss_clip": 1.1457839, + "balance_loss_mlp": 1.02916265, + "epoch": 0.6192508867913185, + "flos": 26688112676640.0, + "grad_norm": 1.8263386021250627, + "language_loss": 0.67349631, + "learning_rate": 1.3373940456048152e-06, + "loss": 0.70130879, + "num_input_tokens_seen": 111022990, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.42773438, + "step": 5150, + "time_per_iteration": 3.0866456031799316 + }, + { + "auxiliary_loss_clip": 0.01505674, + "auxiliary_loss_mlp": 0.01266295, + "balance_loss_clip": 1.14322126, + "balance_loss_mlp": 1.02501559, + "epoch": 0.6193711296819575, + "flos": 36725065989120.0, + "grad_norm": 2.0283680520092884, + "language_loss": 0.59369266, + "learning_rate": 1.3366591171075299e-06, + "loss": 0.6214124, + "num_input_tokens_seen": 111046495, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 2.40820312, + "step": 5151, + "time_per_iteration": 3.2253434658050537 + }, + { + "auxiliary_loss_clip": 0.01500931, + "auxiliary_loss_mlp": 0.01266916, + "balance_loss_clip": 1.13695955, + "balance_loss_mlp": 1.02525449, + "epoch": 0.6194913725725967, + "flos": 25194941936640.0, + "grad_norm": 2.4848397109526634, + "language_loss": 0.90885431, + "learning_rate": 1.335924289238529e-06, + "loss": 0.93653274, + "num_input_tokens_seen": 111065705, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 2.41210938, + "step": 5152, + "time_per_iteration": 3.1611990928649902 + }, + { + "auxiliary_loss_clip": 0.01505231, + "auxiliary_loss_mlp": 0.01273279, + "balance_loss_clip": 1.14276397, + "balance_loss_mlp": 1.02856636, + "epoch": 0.6196116154632357, + "flos": 21180964688640.0, + "grad_norm": 1.7598887968051327, + "language_loss": 0.77066386, + "learning_rate": 1.3351895621092859e-06, + "loss": 0.79844886, + "num_input_tokens_seen": 111086050, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 2.44335938, + "step": 5153, + "time_per_iteration": 3.0381646156311035 + }, + { + "auxiliary_loss_clip": 0.01505055, + "auxiliary_loss_mlp": 0.01260332, + "balance_loss_clip": 1.14293385, + "balance_loss_mlp": 1.01981509, + "epoch": 0.6197318583538748, + "flos": 16255640504160.0, + "grad_norm": 2.306785376055112, + "language_loss": 0.76499432, + "learning_rate": 1.3344549358312567e-06, + "loss": 0.7926482, + "num_input_tokens_seen": 111104450, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 2.40039062, + "step": 5154, + "time_per_iteration": 3.8214168548583984 + }, + { + "auxiliary_loss_clip": 0.01511829, + "auxiliary_loss_mlp": 0.01263672, + "balance_loss_clip": 1.14925444, + "balance_loss_mlp": 1.02162898, + "epoch": 0.619852101244514, + "flos": 24427913106240.0, + "grad_norm": 2.25666333995671, + "language_loss": 0.78664994, + "learning_rate": 1.3337204105158852e-06, + "loss": 0.81440496, + "num_input_tokens_seen": 111123320, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.41601562, + "step": 5155, + "time_per_iteration": 3.026427984237671 + }, + { + "auxiliary_loss_clip": 0.01501734, + "auxiliary_loss_mlp": 0.01268534, + "balance_loss_clip": 1.13806224, + "balance_loss_mlp": 1.02687263, + "epoch": 0.619972344135153, + "flos": 16729178317920.0, + "grad_norm": 2.011142220648448, + "language_loss": 0.73269081, + "learning_rate": 1.332985986274597e-06, + "loss": 0.7603935, + "num_input_tokens_seen": 111140950, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 2.41210938, + "step": 5156, + "time_per_iteration": 2.954245090484619 + }, + { + "auxiliary_loss_clip": 0.01514312, + "auxiliary_loss_mlp": 0.01263223, + "balance_loss_clip": 1.1532743, + "balance_loss_mlp": 1.02556765, + "epoch": 0.6200925870257921, + "flos": 12496846466880.0, + "grad_norm": 1.956094342956332, + "language_loss": 0.75140274, + "learning_rate": 1.3322516632188047e-06, + "loss": 0.77917808, + "num_input_tokens_seen": 111157845, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 2.37109375, + "step": 5157, + "time_per_iteration": 2.961031913757324 + }, + { + "auxiliary_loss_clip": 0.01499473, + "auxiliary_loss_mlp": 0.01268081, + "balance_loss_clip": 1.13563228, + "balance_loss_mlp": 1.02775526, + "epoch": 0.6202128299164312, + "flos": 26541632665440.0, + "grad_norm": 1.8013201624905455, + "language_loss": 0.6697371, + "learning_rate": 1.3315174414599045e-06, + "loss": 0.69741261, + "num_input_tokens_seen": 111179165, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 2.3984375, + "step": 5158, + "time_per_iteration": 2.993204116821289 + }, + { + "auxiliary_loss_clip": 0.01503696, + "auxiliary_loss_mlp": 0.01267494, + "balance_loss_clip": 1.1407119, + "balance_loss_mlp": 1.02583277, + "epoch": 0.6203330728070703, + "flos": 18772502483520.0, + "grad_norm": 1.8518611379645014, + "language_loss": 0.75546449, + "learning_rate": 1.3307833211092768e-06, + "loss": 0.78317642, + "num_input_tokens_seen": 111197830, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.41210938, + "step": 5159, + "time_per_iteration": 2.941619873046875 + }, + { + "auxiliary_loss_clip": 0.01504448, + "auxiliary_loss_mlp": 0.01266264, + "balance_loss_clip": 1.14135766, + "balance_loss_mlp": 1.02307665, + "epoch": 0.6204533156977093, + "flos": 20631797395200.0, + "grad_norm": 1.891793159019614, + "language_loss": 0.75232023, + "learning_rate": 1.3300493022782873e-06, + "loss": 0.78002739, + "num_input_tokens_seen": 111218400, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.42773438, + "step": 5160, + "time_per_iteration": 2.975917339324951 + }, + { + "auxiliary_loss_clip": 0.01494839, + "auxiliary_loss_mlp": 0.01273909, + "balance_loss_clip": 1.13204575, + "balance_loss_mlp": 1.03339267, + "epoch": 0.6205735585883485, + "flos": 17349992634240.0, + "grad_norm": 2.1991502086785446, + "language_loss": 0.72670496, + "learning_rate": 1.3293153850782855e-06, + "loss": 0.7543925, + "num_input_tokens_seen": 111236720, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.40039062, + "step": 5161, + "time_per_iteration": 2.9421799182891846 + }, + { + "auxiliary_loss_clip": 0.01503541, + "auxiliary_loss_mlp": 0.01288537, + "balance_loss_clip": 1.13873053, + "balance_loss_mlp": 1.04439616, + "epoch": 0.6206938014789876, + "flos": 22967171307360.0, + "grad_norm": 6.862556354402105, + "language_loss": 0.71293294, + "learning_rate": 1.3285815696206069e-06, + "loss": 0.74085373, + "num_input_tokens_seen": 111258265, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 2.4375, + "step": 5162, + "time_per_iteration": 3.160507917404175 + }, + { + "auxiliary_loss_clip": 0.01499681, + "auxiliary_loss_mlp": 0.01276223, + "balance_loss_clip": 1.13551521, + "balance_loss_mlp": 1.0320828, + "epoch": 0.6208140443696266, + "flos": 23985211250880.0, + "grad_norm": 1.909359014529882, + "language_loss": 0.77146888, + "learning_rate": 1.32784785601657e-06, + "loss": 0.79922795, + "num_input_tokens_seen": 111277675, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 2.4375, + "step": 5163, + "time_per_iteration": 3.0366861820220947 + }, + { + "auxiliary_loss_clip": 0.01500156, + "auxiliary_loss_mlp": 0.01278603, + "balance_loss_clip": 1.13578904, + "balance_loss_mlp": 1.0361793, + "epoch": 0.6209342872602658, + "flos": 35080067367360.0, + "grad_norm": 1.8551494628761336, + "language_loss": 0.73646796, + "learning_rate": 1.3271142443774798e-06, + "loss": 0.76425552, + "num_input_tokens_seen": 111299910, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 2.41992188, + "step": 5164, + "time_per_iteration": 3.088069438934326 + }, + { + "auxiliary_loss_clip": 0.01506192, + "auxiliary_loss_mlp": 0.01269583, + "balance_loss_clip": 1.14271545, + "balance_loss_mlp": 1.02868509, + "epoch": 0.6210545301509048, + "flos": 26981755405920.0, + "grad_norm": 2.144077196053247, + "language_loss": 0.81166518, + "learning_rate": 1.3263807348146228e-06, + "loss": 0.83942294, + "num_input_tokens_seen": 111319765, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 2.40429688, + "step": 5165, + "time_per_iteration": 3.092212677001953 + }, + { + "auxiliary_loss_clip": 0.01506433, + "auxiliary_loss_mlp": 0.01264043, + "balance_loss_clip": 1.14377666, + "balance_loss_mlp": 1.02104688, + "epoch": 0.6211747730415439, + "flos": 33620956479360.0, + "grad_norm": 5.849880754241308, + "language_loss": 0.73579186, + "learning_rate": 1.3256473274392733e-06, + "loss": 0.76349664, + "num_input_tokens_seen": 111341110, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.42578125, + "step": 5166, + "time_per_iteration": 3.159874200820923 + }, + { + "auxiliary_loss_clip": 0.01502656, + "auxiliary_loss_mlp": 0.01260789, + "balance_loss_clip": 1.13903904, + "balance_loss_mlp": 1.01970065, + "epoch": 0.6212950159321831, + "flos": 34169934132000.0, + "grad_norm": 2.0619094406493756, + "language_loss": 0.70336902, + "learning_rate": 1.3249140223626873e-06, + "loss": 0.73100346, + "num_input_tokens_seen": 111362730, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.40625, + "step": 5167, + "time_per_iteration": 3.1806390285491943 + }, + { + "auxiliary_loss_clip": 0.01504421, + "auxiliary_loss_mlp": 0.0125607, + "balance_loss_clip": 1.14073217, + "balance_loss_mlp": 1.01593518, + "epoch": 0.6214152588228221, + "flos": 27968314612320.0, + "grad_norm": 2.2985505571018385, + "language_loss": 0.7529844, + "learning_rate": 1.3241808196961077e-06, + "loss": 0.78058934, + "num_input_tokens_seen": 111383855, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.39648438, + "step": 5168, + "time_per_iteration": 3.070039749145508 + }, + { + "auxiliary_loss_clip": 0.01499763, + "auxiliary_loss_mlp": 0.01265313, + "balance_loss_clip": 1.13684583, + "balance_loss_mlp": 1.0263226, + "epoch": 0.6215355017134612, + "flos": 20232296078400.0, + "grad_norm": 1.8543281578921489, + "language_loss": 0.70741373, + "learning_rate": 1.3234477195507608e-06, + "loss": 0.73506451, + "num_input_tokens_seen": 111402685, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.38476562, + "step": 5169, + "time_per_iteration": 3.0351860523223877 + }, + { + "auxiliary_loss_clip": 0.01500893, + "auxiliary_loss_mlp": 0.01278476, + "balance_loss_clip": 1.13712454, + "balance_loss_mlp": 1.03624272, + "epoch": 0.6216557446041003, + "flos": 41431390791840.0, + "grad_norm": 2.8121089610094105, + "language_loss": 0.6281718, + "learning_rate": 1.322714722037857e-06, + "loss": 0.65596557, + "num_input_tokens_seen": 111424130, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.41796875, + "step": 5170, + "time_per_iteration": 3.1871719360351562 + }, + { + "auxiliary_loss_clip": 0.01508692, + "auxiliary_loss_mlp": 0.01272577, + "balance_loss_clip": 1.14608216, + "balance_loss_mlp": 1.0297718, + "epoch": 0.6217759874947394, + "flos": 27931296363840.0, + "grad_norm": 2.3433879084076974, + "language_loss": 0.77703327, + "learning_rate": 1.321981827268591e-06, + "loss": 0.80484593, + "num_input_tokens_seen": 111444785, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.42382812, + "step": 5171, + "time_per_iteration": 4.046096563339233 + }, + { + "auxiliary_loss_clip": 0.01504141, + "auxiliary_loss_mlp": 0.01262001, + "balance_loss_clip": 1.14294553, + "balance_loss_mlp": 1.02186584, + "epoch": 0.6218962303853784, + "flos": 21768022578240.0, + "grad_norm": 1.8921644332226428, + "language_loss": 0.81516504, + "learning_rate": 1.3212490353541426e-06, + "loss": 0.84282643, + "num_input_tokens_seen": 111467045, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 2.39648438, + "step": 5172, + "time_per_iteration": 3.0344817638397217 + }, + { + "auxiliary_loss_clip": 0.01505089, + "auxiliary_loss_mlp": 0.01263151, + "balance_loss_clip": 1.14291358, + "balance_loss_mlp": 1.02053618, + "epoch": 0.6220164732760175, + "flos": 21248098260480.0, + "grad_norm": 2.2505902687984287, + "language_loss": 0.80711961, + "learning_rate": 1.3205163464056762e-06, + "loss": 0.83480203, + "num_input_tokens_seen": 111483650, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 2.421875, + "step": 5173, + "time_per_iteration": 3.776447296142578 + }, + { + "auxiliary_loss_clip": 0.01504496, + "auxiliary_loss_mlp": 0.01259871, + "balance_loss_clip": 1.14128387, + "balance_loss_mlp": 1.01973593, + "epoch": 0.6221367161666567, + "flos": 26138528173440.0, + "grad_norm": 2.3092380355353224, + "language_loss": 0.7273429, + "learning_rate": 1.319783760534339e-06, + "loss": 0.75498658, + "num_input_tokens_seen": 111502895, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.39648438, + "step": 5174, + "time_per_iteration": 3.0551581382751465 + }, + { + "auxiliary_loss_clip": 0.01502621, + "auxiliary_loss_mlp": 0.01266988, + "balance_loss_clip": 1.13832927, + "balance_loss_mlp": 1.02360988, + "epoch": 0.6222569590572957, + "flos": 16285755827520.0, + "grad_norm": 2.4136466012393014, + "language_loss": 0.75475764, + "learning_rate": 1.319051277851266e-06, + "loss": 0.78245372, + "num_input_tokens_seen": 111519180, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 2.4296875, + "step": 5175, + "time_per_iteration": 2.9292361736297607 + }, + { + "auxiliary_loss_clip": 0.01506403, + "auxiliary_loss_mlp": 0.01264029, + "balance_loss_clip": 1.1436379, + "balance_loss_mlp": 1.02522933, + "epoch": 0.6223772019479348, + "flos": 18225838448640.0, + "grad_norm": 2.2539061904760955, + "language_loss": 0.84183091, + "learning_rate": 1.3183188984675716e-06, + "loss": 0.86953521, + "num_input_tokens_seen": 111537545, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.3828125, + "step": 5176, + "time_per_iteration": 3.1824724674224854 + }, + { + "auxiliary_loss_clip": 0.01505765, + "auxiliary_loss_mlp": 0.01275911, + "balance_loss_clip": 1.14385605, + "balance_loss_mlp": 1.03424954, + "epoch": 0.6224974448385739, + "flos": 27492235611840.0, + "grad_norm": 2.696972885826897, + "language_loss": 0.7112624, + "learning_rate": 1.3175866224943586e-06, + "loss": 0.73907924, + "num_input_tokens_seen": 111556265, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.41210938, + "step": 5177, + "time_per_iteration": 3.955582857131958 + }, + { + "auxiliary_loss_clip": 0.01504137, + "auxiliary_loss_mlp": 0.01274023, + "balance_loss_clip": 1.14150763, + "balance_loss_mlp": 1.03217125, + "epoch": 0.622617687729213, + "flos": 19793993889600.0, + "grad_norm": 2.6100291797608066, + "language_loss": 0.73294485, + "learning_rate": 1.316854450042712e-06, + "loss": 0.76072645, + "num_input_tokens_seen": 111574205, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.4140625, + "step": 5178, + "time_per_iteration": 3.126047134399414 + }, + { + "auxiliary_loss_clip": 0.0150866, + "auxiliary_loss_mlp": 0.01262267, + "balance_loss_clip": 1.14722109, + "balance_loss_mlp": 1.02175069, + "epoch": 0.622737930619852, + "flos": 23041094019840.0, + "grad_norm": 2.517494232497417, + "language_loss": 0.74309421, + "learning_rate": 1.3161223812237024e-06, + "loss": 0.77080345, + "num_input_tokens_seen": 111593560, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 2.40039062, + "step": 5179, + "time_per_iteration": 2.9810245037078857 + }, + { + "auxiliary_loss_clip": 0.01501666, + "auxiliary_loss_mlp": 0.0126677, + "balance_loss_clip": 1.13789034, + "balance_loss_mlp": 1.02510905, + "epoch": 0.6228581735104912, + "flos": 12635551205280.0, + "grad_norm": 2.6507475831062592, + "language_loss": 0.85478979, + "learning_rate": 1.3153904161483842e-06, + "loss": 0.88247412, + "num_input_tokens_seen": 111608860, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.41210938, + "step": 5180, + "time_per_iteration": 2.9978764057159424 + }, + { + "auxiliary_loss_clip": 0.01499658, + "auxiliary_loss_mlp": 0.01266783, + "balance_loss_clip": 1.13706362, + "balance_loss_mlp": 1.02416873, + "epoch": 0.6229784164011303, + "flos": 23804936884800.0, + "grad_norm": 2.4592434210867533, + "language_loss": 0.85414159, + "learning_rate": 1.3146585549277953e-06, + "loss": 0.88180608, + "num_input_tokens_seen": 111627500, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.421875, + "step": 5181, + "time_per_iteration": 3.0859856605529785 + }, + { + "auxiliary_loss_clip": 0.01508475, + "auxiliary_loss_mlp": 0.01267888, + "balance_loss_clip": 1.14696896, + "balance_loss_mlp": 1.02527356, + "epoch": 0.6230986592917693, + "flos": 22416145534080.0, + "grad_norm": 2.113815700515498, + "language_loss": 0.78536403, + "learning_rate": 1.3139267976729591e-06, + "loss": 0.8131277, + "num_input_tokens_seen": 111647690, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 2.421875, + "step": 5182, + "time_per_iteration": 3.8691532611846924 + }, + { + "auxiliary_loss_clip": 0.01501715, + "auxiliary_loss_mlp": 0.01266407, + "balance_loss_clip": 1.13971484, + "balance_loss_mlp": 1.02360117, + "epoch": 0.6232189021824085, + "flos": 34530862145760.0, + "grad_norm": 2.7974091333944267, + "language_loss": 0.71870798, + "learning_rate": 1.3131951444948815e-06, + "loss": 0.74638921, + "num_input_tokens_seen": 111667090, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 2.42382812, + "step": 5183, + "time_per_iteration": 3.128101348876953 + }, + { + "auxiliary_loss_clip": 0.01509552, + "auxiliary_loss_mlp": 0.0127224, + "balance_loss_clip": 1.14644039, + "balance_loss_mlp": 1.03019691, + "epoch": 0.6233391450730476, + "flos": 22239208846080.0, + "grad_norm": 1.9709879548923224, + "language_loss": 0.76207829, + "learning_rate": 1.3124635955045546e-06, + "loss": 0.78989625, + "num_input_tokens_seen": 111686905, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.41601562, + "step": 5184, + "time_per_iteration": 3.109468698501587 + }, + { + "auxiliary_loss_clip": 0.01501352, + "auxiliary_loss_mlp": 0.01265852, + "balance_loss_clip": 1.13974023, + "balance_loss_mlp": 1.02609825, + "epoch": 0.6234593879636866, + "flos": 20334399778080.0, + "grad_norm": 2.3979715627856337, + "language_loss": 0.84110713, + "learning_rate": 1.3117321508129537e-06, + "loss": 0.86877918, + "num_input_tokens_seen": 111704985, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 2.39257812, + "step": 5185, + "time_per_iteration": 3.04189395904541 + }, + { + "auxiliary_loss_clip": 0.01505822, + "auxiliary_loss_mlp": 0.01267231, + "balance_loss_clip": 1.14347506, + "balance_loss_mlp": 1.0246166, + "epoch": 0.6235796308543258, + "flos": 20666843379360.0, + "grad_norm": 1.8440615176136297, + "language_loss": 0.76412565, + "learning_rate": 1.3110008105310388e-06, + "loss": 0.79185617, + "num_input_tokens_seen": 111724805, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.421875, + "step": 5186, + "time_per_iteration": 3.0372533798217773 + }, + { + "auxiliary_loss_clip": 0.01503466, + "auxiliary_loss_mlp": 0.01270613, + "balance_loss_clip": 1.14035439, + "balance_loss_mlp": 1.02742612, + "epoch": 0.6236998737449648, + "flos": 26621282530080.0, + "grad_norm": 2.0347459388508136, + "language_loss": 0.77859992, + "learning_rate": 1.3102695747697526e-06, + "loss": 0.80634069, + "num_input_tokens_seen": 111747675, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.42773438, + "step": 5187, + "time_per_iteration": 3.122178316116333 + }, + { + "auxiliary_loss_clip": 0.0150831, + "auxiliary_loss_mlp": 0.01274097, + "balance_loss_clip": 1.14571941, + "balance_loss_mlp": 1.02957499, + "epoch": 0.6238201166356039, + "flos": 12676096772640.0, + "grad_norm": 3.2587037293447922, + "language_loss": 0.90686595, + "learning_rate": 1.3095384436400237e-06, + "loss": 0.93469, + "num_input_tokens_seen": 111759205, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.44140625, + "step": 5188, + "time_per_iteration": 3.0796687602996826 + }, + { + "auxiliary_loss_clip": 0.01503188, + "auxiliary_loss_mlp": 0.01272698, + "balance_loss_clip": 1.1411835, + "balance_loss_mlp": 1.03103673, + "epoch": 0.623940359526243, + "flos": 10453711942080.0, + "grad_norm": 2.5719165940340614, + "language_loss": 0.82416093, + "learning_rate": 1.3088074172527633e-06, + "loss": 0.85191977, + "num_input_tokens_seen": 111776335, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 2.41210938, + "step": 5189, + "time_per_iteration": 3.052161931991577 + }, + { + "auxiliary_loss_clip": 0.01498943, + "auxiliary_loss_mlp": 0.01264053, + "balance_loss_clip": 1.13705766, + "balance_loss_mlp": 1.02048421, + "epoch": 0.6240606024168821, + "flos": 29062097820000.0, + "grad_norm": 1.9655339339642453, + "language_loss": 0.71482897, + "learning_rate": 1.3080764957188684e-06, + "loss": 0.74245894, + "num_input_tokens_seen": 111796580, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 2.43164062, + "step": 5190, + "time_per_iteration": 3.0789105892181396 + }, + { + "auxiliary_loss_clip": 0.01502974, + "auxiliary_loss_mlp": 0.01264589, + "balance_loss_clip": 1.1410315, + "balance_loss_mlp": 1.02235603, + "epoch": 0.6241808453075212, + "flos": 22020285320640.0, + "grad_norm": 2.0025537993588847, + "language_loss": 0.71233261, + "learning_rate": 1.3073456791492192e-06, + "loss": 0.74000823, + "num_input_tokens_seen": 111816290, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.41796875, + "step": 5191, + "time_per_iteration": 3.0064804553985596 + }, + { + "auxiliary_loss_clip": 0.0149925, + "auxiliary_loss_mlp": 0.01265016, + "balance_loss_clip": 1.13732505, + "balance_loss_mlp": 1.02354586, + "epoch": 0.6243010881981603, + "flos": 21140646690240.0, + "grad_norm": 2.189942493729794, + "language_loss": 0.78532362, + "learning_rate": 1.3066149676546801e-06, + "loss": 0.81296623, + "num_input_tokens_seen": 111834470, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.41015625, + "step": 5192, + "time_per_iteration": 3.0161867141723633 + }, + { + "auxiliary_loss_clip": 0.01500134, + "auxiliary_loss_mlp": 0.01263764, + "balance_loss_clip": 1.13746905, + "balance_loss_mlp": 1.0264895, + "epoch": 0.6244213310887994, + "flos": 22347077626080.0, + "grad_norm": 2.8529124538946378, + "language_loss": 0.66457915, + "learning_rate": 1.3058843613460985e-06, + "loss": 0.69221812, + "num_input_tokens_seen": 111852410, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.3671875, + "step": 5193, + "time_per_iteration": 3.0230462551116943 + }, + { + "auxiliary_loss_clip": 0.01502845, + "auxiliary_loss_mlp": 0.01268364, + "balance_loss_clip": 1.13999975, + "balance_loss_mlp": 1.02613032, + "epoch": 0.6245415739794384, + "flos": 15233807744640.0, + "grad_norm": 2.0356472731300483, + "language_loss": 0.74614942, + "learning_rate": 1.3051538603343075e-06, + "loss": 0.77386147, + "num_input_tokens_seen": 111870340, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.41796875, + "step": 5194, + "time_per_iteration": 2.9826152324676514 + }, + { + "auxiliary_loss_clip": 0.0150292, + "auxiliary_loss_mlp": 0.01268732, + "balance_loss_clip": 1.14085209, + "balance_loss_mlp": 1.02611697, + "epoch": 0.6246618168700776, + "flos": 18881888389920.0, + "grad_norm": 3.4425755827110875, + "language_loss": 0.67827964, + "learning_rate": 1.3044234647301235e-06, + "loss": 0.70599616, + "num_input_tokens_seen": 111888365, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.421875, + "step": 5195, + "time_per_iteration": 3.2183218002319336 + }, + { + "auxiliary_loss_clip": 0.0149893, + "auxiliary_loss_mlp": 0.01263352, + "balance_loss_clip": 1.1358217, + "balance_loss_mlp": 1.02169073, + "epoch": 0.6247820597607167, + "flos": 14320943681760.0, + "grad_norm": 1.7672747001543052, + "language_loss": 0.72187972, + "learning_rate": 1.303693174644347e-06, + "loss": 0.7495026, + "num_input_tokens_seen": 111905840, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.41210938, + "step": 5196, + "time_per_iteration": 3.134584665298462 + }, + { + "auxiliary_loss_clip": 0.01499642, + "auxiliary_loss_mlp": 0.01267588, + "balance_loss_clip": 1.13817275, + "balance_loss_mlp": 1.02421045, + "epoch": 0.6249023026513557, + "flos": 22640189361120.0, + "grad_norm": 2.029503396981691, + "language_loss": 0.80716109, + "learning_rate": 1.3029629901877625e-06, + "loss": 0.83483338, + "num_input_tokens_seen": 111925215, + "router_z_loss_clip": 3.609375, + "router_z_loss_mlp": 2.4296875, + "step": 5197, + "time_per_iteration": 2.965000867843628 + }, + { + "auxiliary_loss_clip": 0.01504995, + "auxiliary_loss_mlp": 0.01268108, + "balance_loss_clip": 1.14163315, + "balance_loss_mlp": 1.02606511, + "epoch": 0.6250225455419949, + "flos": 20268783332640.0, + "grad_norm": 2.665773493520689, + "language_loss": 0.77316755, + "learning_rate": 1.3022329114711376e-06, + "loss": 0.80089855, + "num_input_tokens_seen": 111943925, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 2.41601562, + "step": 5198, + "time_per_iteration": 3.8529160022735596 + }, + { + "auxiliary_loss_clip": 0.01503729, + "auxiliary_loss_mlp": 0.01276967, + "balance_loss_clip": 1.13950157, + "balance_loss_mlp": 1.03530622, + "epoch": 0.6251427884326339, + "flos": 23439950557920.0, + "grad_norm": 1.8610620757432577, + "language_loss": 0.69917583, + "learning_rate": 1.3015029386052256e-06, + "loss": 0.72698283, + "num_input_tokens_seen": 111964095, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 2.41210938, + "step": 5199, + "time_per_iteration": 3.017815351486206 + }, + { + "auxiliary_loss_clip": 0.01500836, + "auxiliary_loss_mlp": 0.01276874, + "balance_loss_clip": 1.13936687, + "balance_loss_mlp": 1.03311467, + "epoch": 0.625263031323273, + "flos": 31725781164000.0, + "grad_norm": 1.9382694179843198, + "language_loss": 0.73036265, + "learning_rate": 1.3007730717007622e-06, + "loss": 0.75813973, + "num_input_tokens_seen": 111984910, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.43359375, + "step": 5200, + "time_per_iteration": 4.019382476806641 + }, + { + "auxiliary_loss_clip": 0.01504551, + "auxiliary_loss_mlp": 0.01269139, + "balance_loss_clip": 1.14232075, + "balance_loss_mlp": 1.02576184, + "epoch": 0.6253832742139122, + "flos": 24136659851040.0, + "grad_norm": 1.8645012909480554, + "language_loss": 0.7590515, + "learning_rate": 1.3000433108684676e-06, + "loss": 0.7867884, + "num_input_tokens_seen": 112005410, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 2.4296875, + "step": 5201, + "time_per_iteration": 3.1132709980010986 + }, + { + "auxiliary_loss_clip": 0.01497337, + "auxiliary_loss_mlp": 0.01268757, + "balance_loss_clip": 1.13467395, + "balance_loss_mlp": 1.02518904, + "epoch": 0.6255035171045512, + "flos": 27671068707840.0, + "grad_norm": 3.0633083452161594, + "language_loss": 0.80400473, + "learning_rate": 1.2993136562190467e-06, + "loss": 0.83166564, + "num_input_tokens_seen": 112024530, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.43164062, + "step": 5202, + "time_per_iteration": 3.0788021087646484 + }, + { + "auxiliary_loss_clip": 0.0150015, + "auxiliary_loss_mlp": 0.01270836, + "balance_loss_clip": 1.13850987, + "balance_loss_mlp": 1.02688646, + "epoch": 0.6256237599951903, + "flos": 20229754891680.0, + "grad_norm": 1.8806058821582812, + "language_loss": 0.70415056, + "learning_rate": 1.2985841078631871e-06, + "loss": 0.73186052, + "num_input_tokens_seen": 112043850, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.43554688, + "step": 5203, + "time_per_iteration": 3.0211920738220215 + }, + { + "auxiliary_loss_clip": 0.01492843, + "auxiliary_loss_mlp": 0.01270029, + "balance_loss_clip": 1.12979352, + "balance_loss_mlp": 1.02531672, + "epoch": 0.6257440028858293, + "flos": 24172881608160.0, + "grad_norm": 1.9357082494572595, + "language_loss": 0.78321624, + "learning_rate": 1.2978546659115608e-06, + "loss": 0.81084502, + "num_input_tokens_seen": 112061930, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.44335938, + "step": 5204, + "time_per_iteration": 3.89323353767395 + }, + { + "auxiliary_loss_clip": 0.01503197, + "auxiliary_loss_mlp": 0.01264785, + "balance_loss_clip": 1.14095044, + "balance_loss_mlp": 1.02350509, + "epoch": 0.6258642457764685, + "flos": 15853787641440.0, + "grad_norm": 3.4180947095367853, + "language_loss": 0.85153979, + "learning_rate": 1.2971253304748228e-06, + "loss": 0.87921959, + "num_input_tokens_seen": 112079645, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.40820312, + "step": 5205, + "time_per_iteration": 3.1576461791992188 + }, + { + "auxiliary_loss_clip": 0.01500887, + "auxiliary_loss_mlp": 0.01270739, + "balance_loss_clip": 1.13800001, + "balance_loss_mlp": 1.02602625, + "epoch": 0.6259844886671075, + "flos": 11912974542720.0, + "grad_norm": 1.9094489090544795, + "language_loss": 0.75329, + "learning_rate": 1.296396101663614e-06, + "loss": 0.78100634, + "num_input_tokens_seen": 112096205, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.44335938, + "step": 5206, + "time_per_iteration": 2.995671033859253 + }, + { + "auxiliary_loss_clip": 0.0149819, + "auxiliary_loss_mlp": 0.01270689, + "balance_loss_clip": 1.13517559, + "balance_loss_mlp": 1.02979124, + "epoch": 0.6261047315577466, + "flos": 15889781829600.0, + "grad_norm": 2.2113265798730417, + "language_loss": 0.84488577, + "learning_rate": 1.2956669795885565e-06, + "loss": 0.87257457, + "num_input_tokens_seen": 112112835, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.40429688, + "step": 5207, + "time_per_iteration": 3.0299904346466064 + }, + { + "auxiliary_loss_clip": 0.01499523, + "auxiliary_loss_mlp": 0.01274744, + "balance_loss_clip": 1.13645172, + "balance_loss_mlp": 1.03251123, + "epoch": 0.6262249744483858, + "flos": 31251636499680.0, + "grad_norm": 2.510153406736603, + "language_loss": 0.68362761, + "learning_rate": 1.294937964360259e-06, + "loss": 0.71137029, + "num_input_tokens_seen": 112133105, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.41796875, + "step": 5208, + "time_per_iteration": 3.031524896621704 + }, + { + "auxiliary_loss_clip": 0.01493371, + "auxiliary_loss_mlp": 0.01264882, + "balance_loss_clip": 1.12934947, + "balance_loss_mlp": 1.02341151, + "epoch": 0.6263452173390248, + "flos": 27201058212960.0, + "grad_norm": 2.353076191466281, + "language_loss": 0.71633947, + "learning_rate": 1.2942090560893108e-06, + "loss": 0.743922, + "num_input_tokens_seen": 112152510, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 2.41015625, + "step": 5209, + "time_per_iteration": 3.0591678619384766 + }, + { + "auxiliary_loss_clip": 0.01497468, + "auxiliary_loss_mlp": 0.01270645, + "balance_loss_clip": 1.13585579, + "balance_loss_mlp": 1.03031921, + "epoch": 0.6264654602296639, + "flos": 37345121742240.0, + "grad_norm": 2.890673665164061, + "language_loss": 0.60215569, + "learning_rate": 1.2934802548862882e-06, + "loss": 0.6298368, + "num_input_tokens_seen": 112175295, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 2.3984375, + "step": 5210, + "time_per_iteration": 3.9627058506011963 + }, + { + "auxiliary_loss_clip": 0.01491007, + "auxiliary_loss_mlp": 0.01262753, + "balance_loss_clip": 1.12790251, + "balance_loss_mlp": 1.02185512, + "epoch": 0.626585703120303, + "flos": 14758714876320.0, + "grad_norm": 2.0509358918803433, + "language_loss": 0.82819021, + "learning_rate": 1.292751560861749e-06, + "loss": 0.85572791, + "num_input_tokens_seen": 112190200, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.40429688, + "step": 5211, + "time_per_iteration": 3.0470049381256104 + }, + { + "auxiliary_loss_clip": 0.01496551, + "auxiliary_loss_mlp": 0.01260602, + "balance_loss_clip": 1.13467085, + "balance_loss_mlp": 1.01817858, + "epoch": 0.6267059460109421, + "flos": 22349580884640.0, + "grad_norm": 1.7511079674397805, + "language_loss": 0.79705262, + "learning_rate": 1.2920229741262354e-06, + "loss": 0.82462418, + "num_input_tokens_seen": 112208205, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.41992188, + "step": 5212, + "time_per_iteration": 3.0792124271392822 + }, + { + "auxiliary_loss_clip": 0.01495605, + "auxiliary_loss_mlp": 0.0126748, + "balance_loss_clip": 1.13332868, + "balance_loss_mlp": 1.0273453, + "epoch": 0.6268261889015811, + "flos": 17750745580320.0, + "grad_norm": 1.9588433427829972, + "language_loss": 0.75137687, + "learning_rate": 1.2912944947902739e-06, + "loss": 0.77900767, + "num_input_tokens_seen": 112224690, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 2.39648438, + "step": 5213, + "time_per_iteration": 2.993417978286743 + }, + { + "auxiliary_loss_clip": 0.0149249, + "auxiliary_loss_mlp": 0.01265295, + "balance_loss_clip": 1.12987447, + "balance_loss_mlp": 1.02153552, + "epoch": 0.6269464317922203, + "flos": 32848845275520.0, + "grad_norm": 2.8813850243739156, + "language_loss": 0.71525776, + "learning_rate": 1.2905661229643742e-06, + "loss": 0.74283558, + "num_input_tokens_seen": 112244450, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.43359375, + "step": 5214, + "time_per_iteration": 3.0499749183654785 + }, + { + "auxiliary_loss_clip": 0.01496142, + "auxiliary_loss_mlp": 0.01276901, + "balance_loss_clip": 1.13283098, + "balance_loss_mlp": 1.03333282, + "epoch": 0.6270666746828594, + "flos": 17931019946400.0, + "grad_norm": 2.153200517459351, + "language_loss": 0.84402657, + "learning_rate": 1.2898378587590299e-06, + "loss": 0.87175703, + "num_input_tokens_seen": 112261050, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.43164062, + "step": 5215, + "time_per_iteration": 3.054710626602173 + }, + { + "auxiliary_loss_clip": 0.01493881, + "auxiliary_loss_mlp": 0.01264116, + "balance_loss_clip": 1.13165426, + "balance_loss_mlp": 1.02207339, + "epoch": 0.6271869175734984, + "flos": 17459264756160.0, + "grad_norm": 1.9607663944250837, + "language_loss": 0.87723982, + "learning_rate": 1.2891097022847173e-06, + "loss": 0.90481973, + "num_input_tokens_seen": 112278395, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.41601562, + "step": 5216, + "time_per_iteration": 2.970569610595703 + }, + { + "auxiliary_loss_clip": 0.01491673, + "auxiliary_loss_mlp": 0.01276973, + "balance_loss_clip": 1.12838125, + "balance_loss_mlp": 1.03531229, + "epoch": 0.6273071604641376, + "flos": 26870966157600.0, + "grad_norm": 1.8034644300348213, + "language_loss": 0.66613948, + "learning_rate": 1.2883816536518978e-06, + "loss": 0.69382596, + "num_input_tokens_seen": 112299535, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.41210938, + "step": 5217, + "time_per_iteration": 3.124866008758545 + }, + { + "auxiliary_loss_clip": 0.01491985, + "auxiliary_loss_mlp": 0.01261237, + "balance_loss_clip": 1.12984324, + "balance_loss_mlp": 1.02148283, + "epoch": 0.6274274033547766, + "flos": 26064491676480.0, + "grad_norm": 1.8903421055065666, + "language_loss": 0.82141173, + "learning_rate": 1.2876537129710155e-06, + "loss": 0.84894389, + "num_input_tokens_seen": 112317265, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.39257812, + "step": 5218, + "time_per_iteration": 3.1123292446136475 + }, + { + "auxiliary_loss_clip": 0.01501897, + "auxiliary_loss_mlp": 0.01279108, + "balance_loss_clip": 1.13801885, + "balance_loss_mlp": 1.03306055, + "epoch": 0.6275476462454157, + "flos": 20268328194720.0, + "grad_norm": 2.478478444967074, + "language_loss": 0.7506904, + "learning_rate": 1.286925880352499e-06, + "loss": 0.77850044, + "num_input_tokens_seen": 112336125, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 2.45703125, + "step": 5219, + "time_per_iteration": 3.003967523574829 + }, + { + "auxiliary_loss_clip": 0.01496139, + "auxiliary_loss_mlp": 0.01271349, + "balance_loss_clip": 1.13299775, + "balance_loss_mlp": 1.02835274, + "epoch": 0.6276678891360549, + "flos": 26322671211840.0, + "grad_norm": 1.6158797832570313, + "language_loss": 0.71159583, + "learning_rate": 1.2861981559067592e-06, + "loss": 0.73927069, + "num_input_tokens_seen": 112356730, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.42578125, + "step": 5220, + "time_per_iteration": 3.0945937633514404 + }, + { + "auxiliary_loss_clip": 0.01487714, + "auxiliary_loss_mlp": 0.01277211, + "balance_loss_clip": 1.12398291, + "balance_loss_mlp": 1.03650367, + "epoch": 0.6277881320266939, + "flos": 13913932589280.0, + "grad_norm": 2.7133146991941213, + "language_loss": 0.79990244, + "learning_rate": 1.2854705397441917e-06, + "loss": 0.82755166, + "num_input_tokens_seen": 112372270, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.40234375, + "step": 5221, + "time_per_iteration": 3.0003650188446045 + }, + { + "auxiliary_loss_clip": 0.01492506, + "auxiliary_loss_mlp": 0.01268007, + "balance_loss_clip": 1.13124728, + "balance_loss_mlp": 1.02558291, + "epoch": 0.627908374917333, + "flos": 27051164667360.0, + "grad_norm": 5.319866542018381, + "language_loss": 0.77521586, + "learning_rate": 1.2847430319751747e-06, + "loss": 0.80282098, + "num_input_tokens_seen": 112390365, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 2.41992188, + "step": 5222, + "time_per_iteration": 3.0042500495910645 + }, + { + "auxiliary_loss_clip": 0.01498224, + "auxiliary_loss_mlp": 0.0127419, + "balance_loss_clip": 1.13493812, + "balance_loss_mlp": 1.03424573, + "epoch": 0.6280286178079721, + "flos": 23771483883360.0, + "grad_norm": 2.7961047207966097, + "language_loss": 0.67178547, + "learning_rate": 1.2840156327100712e-06, + "loss": 0.69950962, + "num_input_tokens_seen": 112407490, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.39453125, + "step": 5223, + "time_per_iteration": 3.067301034927368 + }, + { + "auxiliary_loss_clip": 0.01496806, + "auxiliary_loss_mlp": 0.01261934, + "balance_loss_clip": 1.13491094, + "balance_loss_mlp": 1.02122688, + "epoch": 0.6281488606986112, + "flos": 26361547940160.0, + "grad_norm": 2.4758347233117806, + "language_loss": 0.72381419, + "learning_rate": 1.2832883420592272e-06, + "loss": 0.75140154, + "num_input_tokens_seen": 112426385, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.40234375, + "step": 5224, + "time_per_iteration": 3.0385026931762695 + }, + { + "auxiliary_loss_clip": 0.01502813, + "auxiliary_loss_mlp": 0.01264943, + "balance_loss_clip": 1.14020681, + "balance_loss_mlp": 1.02423525, + "epoch": 0.6282691035892503, + "flos": 36140132076480.0, + "grad_norm": 2.645482175201078, + "language_loss": 0.64335299, + "learning_rate": 1.282561160132972e-06, + "loss": 0.67103046, + "num_input_tokens_seen": 112446905, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 2.40234375, + "step": 5225, + "time_per_iteration": 3.173187255859375 + }, + { + "auxiliary_loss_clip": 0.0149648, + "auxiliary_loss_mlp": 0.01278331, + "balance_loss_clip": 1.13423526, + "balance_loss_mlp": 1.03762388, + "epoch": 0.6283893464798894, + "flos": 26539394904000.0, + "grad_norm": 1.7771816592227798, + "language_loss": 0.80937058, + "learning_rate": 1.2818340870416186e-06, + "loss": 0.83711863, + "num_input_tokens_seen": 112468040, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.40234375, + "step": 5226, + "time_per_iteration": 3.9732182025909424 + }, + { + "auxiliary_loss_clip": 0.01497662, + "auxiliary_loss_mlp": 0.01267189, + "balance_loss_clip": 1.13554955, + "balance_loss_mlp": 1.02362037, + "epoch": 0.6285095893705285, + "flos": 22239436415040.0, + "grad_norm": 1.8566316425526679, + "language_loss": 0.75925833, + "learning_rate": 1.2811071228954626e-06, + "loss": 0.78690684, + "num_input_tokens_seen": 112486675, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 2.43164062, + "step": 5227, + "time_per_iteration": 3.944495439529419 + }, + { + "auxiliary_loss_clip": 0.01499444, + "auxiliary_loss_mlp": 0.01277384, + "balance_loss_clip": 1.1387558, + "balance_loss_mlp": 1.03763056, + "epoch": 0.6286298322611675, + "flos": 26544704846400.0, + "grad_norm": 2.1706065195358804, + "language_loss": 0.81322038, + "learning_rate": 1.2803802678047846e-06, + "loss": 0.84098876, + "num_input_tokens_seen": 112506825, + "router_z_loss_clip": 3.60742188, + "router_z_loss_mlp": 2.39257812, + "step": 5228, + "time_per_iteration": 3.070711612701416 + }, + { + "auxiliary_loss_clip": 0.01498779, + "auxiliary_loss_mlp": 0.0127131, + "balance_loss_clip": 1.13684916, + "balance_loss_mlp": 1.02736068, + "epoch": 0.6287500751518067, + "flos": 21797189697600.0, + "grad_norm": 1.9867116203072548, + "language_loss": 0.74585599, + "learning_rate": 1.279653521879848e-06, + "loss": 0.77355689, + "num_input_tokens_seen": 112526890, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.43554688, + "step": 5229, + "time_per_iteration": 3.1338894367218018 + }, + { + "auxiliary_loss_clip": 0.01492544, + "auxiliary_loss_mlp": 0.01266292, + "balance_loss_clip": 1.1306957, + "balance_loss_mlp": 1.02520323, + "epoch": 0.6288703180424458, + "flos": 20011589929440.0, + "grad_norm": 2.105768227656004, + "language_loss": 0.841254, + "learning_rate": 1.2789268852308997e-06, + "loss": 0.86884236, + "num_input_tokens_seen": 112542100, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 2.40625, + "step": 5230, + "time_per_iteration": 2.9777884483337402 + }, + { + "auxiliary_loss_clip": 0.01498084, + "auxiliary_loss_mlp": 0.01271352, + "balance_loss_clip": 1.13580585, + "balance_loss_mlp": 1.0289284, + "epoch": 0.6289905609330848, + "flos": 22126978327680.0, + "grad_norm": 2.532235824501096, + "language_loss": 0.7082535, + "learning_rate": 1.2782003579681688e-06, + "loss": 0.73594785, + "num_input_tokens_seen": 112561630, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.41992188, + "step": 5231, + "time_per_iteration": 3.8968214988708496 + }, + { + "auxiliary_loss_clip": 0.01501495, + "auxiliary_loss_mlp": 0.01272488, + "balance_loss_clip": 1.14053845, + "balance_loss_mlp": 1.03139949, + "epoch": 0.629110803823724, + "flos": 25520748109920.0, + "grad_norm": 1.9299048252539597, + "language_loss": 0.74482918, + "learning_rate": 1.2774739402018701e-06, + "loss": 0.77256906, + "num_input_tokens_seen": 112582465, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 2.40625, + "step": 5232, + "time_per_iteration": 3.1411867141723633 + }, + { + "auxiliary_loss_clip": 0.01498615, + "auxiliary_loss_mlp": 0.01271933, + "balance_loss_clip": 1.13676286, + "balance_loss_mlp": 1.03046298, + "epoch": 0.629231046714363, + "flos": 20888725301280.0, + "grad_norm": 1.561207383287844, + "language_loss": 0.73082465, + "learning_rate": 1.2767476320422002e-06, + "loss": 0.75853014, + "num_input_tokens_seen": 112602390, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.41015625, + "step": 5233, + "time_per_iteration": 3.0379221439361572 + }, + { + "auxiliary_loss_clip": 0.01471383, + "auxiliary_loss_mlp": 0.01229576, + "balance_loss_clip": 1.11726642, + "balance_loss_mlp": 1.02892303, + "epoch": 0.6293512896050021, + "flos": 65057532144480.0, + "grad_norm": 0.7007638944604632, + "language_loss": 0.57162935, + "learning_rate": 1.2760214335993392e-06, + "loss": 0.59863895, + "num_input_tokens_seen": 112669035, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 2.0078125, + "step": 5234, + "time_per_iteration": 3.626209020614624 + }, + { + "auxiliary_loss_clip": 0.01492282, + "auxiliary_loss_mlp": 0.01268014, + "balance_loss_clip": 1.13041711, + "balance_loss_mlp": 1.02978635, + "epoch": 0.6294715324956413, + "flos": 34680717763200.0, + "grad_norm": 2.012705352318339, + "language_loss": 0.58886933, + "learning_rate": 1.2752953449834514e-06, + "loss": 0.61647236, + "num_input_tokens_seen": 112691485, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 2.37695312, + "step": 5235, + "time_per_iteration": 3.1925883293151855 + }, + { + "auxiliary_loss_clip": 0.01496453, + "auxiliary_loss_mlp": 0.01266597, + "balance_loss_clip": 1.13505137, + "balance_loss_mlp": 1.02550828, + "epoch": 0.6295917753862803, + "flos": 22786441803360.0, + "grad_norm": 1.8147445731105036, + "language_loss": 0.80549842, + "learning_rate": 1.2745693663046836e-06, + "loss": 0.83312893, + "num_input_tokens_seen": 112710555, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 2.40625, + "step": 5236, + "time_per_iteration": 3.1718671321868896 + }, + { + "auxiliary_loss_clip": 0.01492277, + "auxiliary_loss_mlp": 0.01270956, + "balance_loss_clip": 1.12987185, + "balance_loss_mlp": 1.03310966, + "epoch": 0.6297120182769194, + "flos": 20852579400480.0, + "grad_norm": 1.7772309399709167, + "language_loss": 0.80795276, + "learning_rate": 1.2738434976731662e-06, + "loss": 0.83558512, + "num_input_tokens_seen": 112728740, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.37304688, + "step": 5237, + "time_per_iteration": 3.0377607345581055 + }, + { + "auxiliary_loss_clip": 0.01499521, + "auxiliary_loss_mlp": 0.01267479, + "balance_loss_clip": 1.13665104, + "balance_loss_mlp": 1.02753448, + "epoch": 0.6298322611675584, + "flos": 19499554668960.0, + "grad_norm": 1.7325494963896388, + "language_loss": 0.7543081, + "learning_rate": 1.2731177391990125e-06, + "loss": 0.78197813, + "num_input_tokens_seen": 112748665, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.39453125, + "step": 5238, + "time_per_iteration": 4.010354995727539 + }, + { + "auxiliary_loss_clip": 0.01487326, + "auxiliary_loss_mlp": 0.01263222, + "balance_loss_clip": 1.12549257, + "balance_loss_mlp": 1.02289629, + "epoch": 0.6299525040581976, + "flos": 12606118588800.0, + "grad_norm": 2.094700456899407, + "language_loss": 0.81749582, + "learning_rate": 1.2723920909923203e-06, + "loss": 0.84500134, + "num_input_tokens_seen": 112764410, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.3984375, + "step": 5239, + "time_per_iteration": 3.011918306350708 + }, + { + "auxiliary_loss_clip": 0.01464035, + "auxiliary_loss_mlp": 0.01204117, + "balance_loss_clip": 1.10976243, + "balance_loss_mlp": 1.00575256, + "epoch": 0.6300727469488366, + "flos": 57731255894880.0, + "grad_norm": 0.8592432582406198, + "language_loss": 0.60369331, + "learning_rate": 1.2716665531631688e-06, + "loss": 0.63037485, + "num_input_tokens_seen": 112818695, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 1.98046875, + "step": 5240, + "time_per_iteration": 3.4814352989196777 + }, + { + "auxiliary_loss_clip": 0.01487593, + "auxiliary_loss_mlp": 0.01264947, + "balance_loss_clip": 1.12388682, + "balance_loss_mlp": 1.02156997, + "epoch": 0.6301929898394757, + "flos": 22529438040960.0, + "grad_norm": 2.1802402781173122, + "language_loss": 0.77211469, + "learning_rate": 1.270941125821623e-06, + "loss": 0.79964012, + "num_input_tokens_seen": 112839120, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 2.4296875, + "step": 5241, + "time_per_iteration": 3.124326229095459 + }, + { + "auxiliary_loss_clip": 0.01485694, + "auxiliary_loss_mlp": 0.01265999, + "balance_loss_clip": 1.12407136, + "balance_loss_mlp": 1.02510118, + "epoch": 0.6303132327301149, + "flos": 28296093049920.0, + "grad_norm": 1.6249160392121824, + "language_loss": 0.75610381, + "learning_rate": 1.2702158090777278e-06, + "loss": 0.78362072, + "num_input_tokens_seen": 112860210, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 2.40429688, + "step": 5242, + "time_per_iteration": 3.1158714294433594 + }, + { + "auxiliary_loss_clip": 0.01487597, + "auxiliary_loss_mlp": 0.01272374, + "balance_loss_clip": 1.12496185, + "balance_loss_mlp": 1.03185773, + "epoch": 0.6304334756207539, + "flos": 25266854456640.0, + "grad_norm": 1.9395907947362843, + "language_loss": 0.74816406, + "learning_rate": 1.2694906030415148e-06, + "loss": 0.77576381, + "num_input_tokens_seen": 112877955, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.40039062, + "step": 5243, + "time_per_iteration": 3.0900495052337646 + }, + { + "auxiliary_loss_clip": 0.01488279, + "auxiliary_loss_mlp": 0.0126876, + "balance_loss_clip": 1.12603283, + "balance_loss_mlp": 1.02309382, + "epoch": 0.630553718511393, + "flos": 18035095910400.0, + "grad_norm": 9.312070169651191, + "language_loss": 0.82219732, + "learning_rate": 1.2687655078229958e-06, + "loss": 0.84976768, + "num_input_tokens_seen": 112892285, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.453125, + "step": 5244, + "time_per_iteration": 3.02606201171875 + }, + { + "auxiliary_loss_clip": 0.01486127, + "auxiliary_loss_mlp": 0.01270184, + "balance_loss_clip": 1.12365854, + "balance_loss_mlp": 1.02947712, + "epoch": 0.6306739614020321, + "flos": 27306347878080.0, + "grad_norm": 2.323459904715865, + "language_loss": 0.69062328, + "learning_rate": 1.2680405235321678e-06, + "loss": 0.71818638, + "num_input_tokens_seen": 112913620, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.40234375, + "step": 5245, + "time_per_iteration": 3.1541738510131836 + }, + { + "auxiliary_loss_clip": 0.01497063, + "auxiliary_loss_mlp": 0.01271169, + "balance_loss_clip": 1.13527238, + "balance_loss_mlp": 1.03008044, + "epoch": 0.6307942042926712, + "flos": 15343269507360.0, + "grad_norm": 1.9880595727825836, + "language_loss": 0.7882514, + "learning_rate": 1.267315650279011e-06, + "loss": 0.8159337, + "num_input_tokens_seen": 112932090, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.40625, + "step": 5246, + "time_per_iteration": 3.15258526802063 + }, + { + "auxiliary_loss_clip": 0.01486784, + "auxiliary_loss_mlp": 0.01269087, + "balance_loss_clip": 1.1250602, + "balance_loss_mlp": 1.02666283, + "epoch": 0.6309144471833102, + "flos": 19608257868480.0, + "grad_norm": 2.549120376948468, + "language_loss": 0.74117768, + "learning_rate": 1.2665908881734874e-06, + "loss": 0.76873636, + "num_input_tokens_seen": 112950925, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.41992188, + "step": 5247, + "time_per_iteration": 3.2036097049713135 + }, + { + "auxiliary_loss_clip": 0.0148569, + "auxiliary_loss_mlp": 0.01263091, + "balance_loss_clip": 1.12195516, + "balance_loss_mlp": 1.02257419, + "epoch": 0.6310346900739494, + "flos": 17495258944320.0, + "grad_norm": 2.2766782183692653, + "language_loss": 0.85366237, + "learning_rate": 1.2658662373255432e-06, + "loss": 0.88115013, + "num_input_tokens_seen": 112969315, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 2.40039062, + "step": 5248, + "time_per_iteration": 3.1466665267944336 + }, + { + "auxiliary_loss_clip": 0.01459229, + "auxiliary_loss_mlp": 0.01209228, + "balance_loss_clip": 1.10538042, + "balance_loss_mlp": 1.01086426, + "epoch": 0.6311549329645885, + "flos": 55076030530560.0, + "grad_norm": 0.7117526428278687, + "language_loss": 0.52173245, + "learning_rate": 1.2651416978451063e-06, + "loss": 0.54841709, + "num_input_tokens_seen": 113034700, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 1.984375, + "step": 5249, + "time_per_iteration": 3.5935850143432617 + }, + { + "auxiliary_loss_clip": 0.01491993, + "auxiliary_loss_mlp": 0.01272808, + "balance_loss_clip": 1.12805331, + "balance_loss_mlp": 1.02962112, + "epoch": 0.6312751758552275, + "flos": 41905838881440.0, + "grad_norm": 2.4431168905889744, + "language_loss": 0.65610093, + "learning_rate": 1.2644172698420903e-06, + "loss": 0.68374896, + "num_input_tokens_seen": 113056805, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 2.42773438, + "step": 5250, + "time_per_iteration": 3.135377883911133 + }, + { + "auxiliary_loss_clip": 0.01485705, + "auxiliary_loss_mlp": 0.01267109, + "balance_loss_clip": 1.12253666, + "balance_loss_mlp": 1.02544832, + "epoch": 0.6313954187458667, + "flos": 19648879292160.0, + "grad_norm": 2.214952681606562, + "language_loss": 0.84950829, + "learning_rate": 1.2636929534263892e-06, + "loss": 0.87703645, + "num_input_tokens_seen": 113075790, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.41210938, + "step": 5251, + "time_per_iteration": 2.985598087310791 + }, + { + "auxiliary_loss_clip": 0.01482922, + "auxiliary_loss_mlp": 0.01260319, + "balance_loss_clip": 1.12055361, + "balance_loss_mlp": 1.01903915, + "epoch": 0.6315156616365057, + "flos": 22896548344800.0, + "grad_norm": 1.8801687953590498, + "language_loss": 0.77552307, + "learning_rate": 1.2629687487078821e-06, + "loss": 0.80295545, + "num_input_tokens_seen": 113094600, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.40820312, + "step": 5252, + "time_per_iteration": 3.0115411281585693 + }, + { + "auxiliary_loss_clip": 0.01483121, + "auxiliary_loss_mlp": 0.01261501, + "balance_loss_clip": 1.11804914, + "balance_loss_mlp": 1.0215565, + "epoch": 0.6316359045271448, + "flos": 23728169560320.0, + "grad_norm": 2.0725420117404236, + "language_loss": 0.76827443, + "learning_rate": 1.2622446557964293e-06, + "loss": 0.7957207, + "num_input_tokens_seen": 113112605, + "router_z_loss_clip": 3.65039062, + "router_z_loss_mlp": 2.39453125, + "step": 5253, + "time_per_iteration": 2.9694013595581055 + }, + { + "auxiliary_loss_clip": 0.01483708, + "auxiliary_loss_mlp": 0.01263626, + "balance_loss_clip": 1.12043953, + "balance_loss_mlp": 1.02272832, + "epoch": 0.631756147417784, + "flos": 33111158980320.0, + "grad_norm": 2.9033659342218483, + "language_loss": 0.71643555, + "learning_rate": 1.261520674801876e-06, + "loss": 0.74390894, + "num_input_tokens_seen": 113133200, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 2.40429688, + "step": 5254, + "time_per_iteration": 4.776504755020142 + }, + { + "auxiliary_loss_clip": 0.01491045, + "auxiliary_loss_mlp": 0.01268241, + "balance_loss_clip": 1.1273104, + "balance_loss_mlp": 1.02524447, + "epoch": 0.631876390308423, + "flos": 31251105505440.0, + "grad_norm": 2.0728757848992037, + "language_loss": 0.72581637, + "learning_rate": 1.2607968058340488e-06, + "loss": 0.75340927, + "num_input_tokens_seen": 113152895, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 2.42578125, + "step": 5255, + "time_per_iteration": 3.094712257385254 + }, + { + "auxiliary_loss_clip": 0.01486698, + "auxiliary_loss_mlp": 0.01267318, + "balance_loss_clip": 1.12260318, + "balance_loss_mlp": 1.02756429, + "epoch": 0.6319966331990621, + "flos": 24683475598560.0, + "grad_norm": 1.98488611041921, + "language_loss": 0.73429906, + "learning_rate": 1.2600730490027583e-06, + "loss": 0.76183927, + "num_input_tokens_seen": 113173135, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 2.39257812, + "step": 5256, + "time_per_iteration": 3.121887445449829 + }, + { + "auxiliary_loss_clip": 0.01483587, + "auxiliary_loss_mlp": 0.01261567, + "balance_loss_clip": 1.12100375, + "balance_loss_mlp": 1.02333903, + "epoch": 0.6321168760897012, + "flos": 17493590105280.0, + "grad_norm": 1.973107224920688, + "language_loss": 0.80426741, + "learning_rate": 1.2593494044177984e-06, + "loss": 0.83171892, + "num_input_tokens_seen": 113191440, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.37695312, + "step": 5257, + "time_per_iteration": 3.1949334144592285 + }, + { + "auxiliary_loss_clip": 0.01484141, + "auxiliary_loss_mlp": 0.01274985, + "balance_loss_clip": 1.11929011, + "balance_loss_mlp": 1.03217936, + "epoch": 0.6322371189803403, + "flos": 18297295830720.0, + "grad_norm": 3.7379713836526043, + "language_loss": 0.80882972, + "learning_rate": 1.2586258721889448e-06, + "loss": 0.83642101, + "num_input_tokens_seen": 113208790, + "router_z_loss_clip": 3.65039062, + "router_z_loss_mlp": 2.42382812, + "step": 5258, + "time_per_iteration": 3.9887380599975586 + }, + { + "auxiliary_loss_clip": 0.01487778, + "auxiliary_loss_mlp": 0.01267157, + "balance_loss_clip": 1.1239512, + "balance_loss_mlp": 1.02721298, + "epoch": 0.6323573618709794, + "flos": 20159359498080.0, + "grad_norm": 2.256555428011579, + "language_loss": 0.81591487, + "learning_rate": 1.2579024524259573e-06, + "loss": 0.84346426, + "num_input_tokens_seen": 113225050, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 2.39453125, + "step": 5259, + "time_per_iteration": 3.0172665119171143 + }, + { + "auxiliary_loss_clip": 0.01485398, + "auxiliary_loss_mlp": 0.01266901, + "balance_loss_clip": 1.12121248, + "balance_loss_mlp": 1.02657557, + "epoch": 0.6324776047616185, + "flos": 20044170583200.0, + "grad_norm": 1.8721822008018254, + "language_loss": 0.91238004, + "learning_rate": 1.2571791452385768e-06, + "loss": 0.93990302, + "num_input_tokens_seen": 113242315, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 2.3984375, + "step": 5260, + "time_per_iteration": 3.0827832221984863 + }, + { + "auxiliary_loss_clip": 0.01484556, + "auxiliary_loss_mlp": 0.0126706, + "balance_loss_clip": 1.12185287, + "balance_loss_mlp": 1.02902293, + "epoch": 0.6325978476522576, + "flos": 30850959409920.0, + "grad_norm": 2.0022133694689064, + "language_loss": 0.77418935, + "learning_rate": 1.2564559507365301e-06, + "loss": 0.80170548, + "num_input_tokens_seen": 113264720, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.375, + "step": 5261, + "time_per_iteration": 3.1174912452697754 + }, + { + "auxiliary_loss_clip": 0.01488399, + "auxiliary_loss_mlp": 0.01269431, + "balance_loss_clip": 1.12419462, + "balance_loss_mlp": 1.02967763, + "epoch": 0.6327180905428966, + "flos": 24537412797120.0, + "grad_norm": 2.219556060252349, + "language_loss": 0.7908777, + "learning_rate": 1.2557328690295244e-06, + "loss": 0.81845599, + "num_input_tokens_seen": 113282910, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 2.39257812, + "step": 5262, + "time_per_iteration": 3.169400453567505 + }, + { + "auxiliary_loss_clip": 0.01486836, + "auxiliary_loss_mlp": 0.01277706, + "balance_loss_clip": 1.12355351, + "balance_loss_mlp": 1.03661692, + "epoch": 0.6328383334335358, + "flos": 21577166255520.0, + "grad_norm": 1.7969907867604542, + "language_loss": 0.76689756, + "learning_rate": 1.255009900227251e-06, + "loss": 0.79454291, + "num_input_tokens_seen": 113301935, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 2.40625, + "step": 5263, + "time_per_iteration": 3.105571746826172 + }, + { + "auxiliary_loss_clip": 0.01490672, + "auxiliary_loss_mlp": 0.01268873, + "balance_loss_clip": 1.12682939, + "balance_loss_mlp": 1.02854693, + "epoch": 0.6329585763241748, + "flos": 22932087395040.0, + "grad_norm": 2.5035038766099795, + "language_loss": 0.79836071, + "learning_rate": 1.254287044439383e-06, + "loss": 0.82595611, + "num_input_tokens_seen": 113321540, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 2.3984375, + "step": 5264, + "time_per_iteration": 3.0754663944244385 + }, + { + "auxiliary_loss_clip": 0.01463428, + "auxiliary_loss_mlp": 0.01214714, + "balance_loss_clip": 1.10888696, + "balance_loss_mlp": 1.01634979, + "epoch": 0.6330788192148139, + "flos": 70943396817600.0, + "grad_norm": 0.7775591076535707, + "language_loss": 0.54422313, + "learning_rate": 1.2535643017755776e-06, + "loss": 0.57100451, + "num_input_tokens_seen": 113383730, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 1.98046875, + "step": 5265, + "time_per_iteration": 3.551525592803955 + }, + { + "auxiliary_loss_clip": 0.01486205, + "auxiliary_loss_mlp": 0.01279319, + "balance_loss_clip": 1.12344885, + "balance_loss_mlp": 1.04128194, + "epoch": 0.6331990621054531, + "flos": 21246239780640.0, + "grad_norm": 2.249799163288453, + "language_loss": 0.7233569, + "learning_rate": 1.2528416723454737e-06, + "loss": 0.75101209, + "num_input_tokens_seen": 113400400, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 2.375, + "step": 5266, + "time_per_iteration": 3.8807213306427 + }, + { + "auxiliary_loss_clip": 0.01491833, + "auxiliary_loss_mlp": 0.01272099, + "balance_loss_clip": 1.12929773, + "balance_loss_mlp": 1.03253555, + "epoch": 0.6333193049960921, + "flos": 34462287303840.0, + "grad_norm": 1.742526641175215, + "language_loss": 0.70683539, + "learning_rate": 1.2521191562586945e-06, + "loss": 0.73447472, + "num_input_tokens_seen": 113424050, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.390625, + "step": 5267, + "time_per_iteration": 3.1946003437042236 + }, + { + "auxiliary_loss_clip": 0.01490717, + "auxiliary_loss_mlp": 0.01274638, + "balance_loss_clip": 1.12842178, + "balance_loss_mlp": 1.0339303, + "epoch": 0.6334395478867312, + "flos": 18331697036160.0, + "grad_norm": 2.015506764788158, + "language_loss": 0.77340931, + "learning_rate": 1.2513967536248445e-06, + "loss": 0.80106282, + "num_input_tokens_seen": 113440370, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.40234375, + "step": 5268, + "time_per_iteration": 3.0754966735839844 + }, + { + "auxiliary_loss_clip": 0.01491253, + "auxiliary_loss_mlp": 0.01264169, + "balance_loss_clip": 1.12928987, + "balance_loss_mlp": 1.02517772, + "epoch": 0.6335597907773702, + "flos": 23625421081920.0, + "grad_norm": 1.7167434711915959, + "language_loss": 0.80725282, + "learning_rate": 1.2506744645535117e-06, + "loss": 0.83480698, + "num_input_tokens_seen": 113460800, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.38476562, + "step": 5269, + "time_per_iteration": 3.1776559352874756 + }, + { + "auxiliary_loss_clip": 0.01485883, + "auxiliary_loss_mlp": 0.01268639, + "balance_loss_clip": 1.12482405, + "balance_loss_mlp": 1.02812243, + "epoch": 0.6336800336680094, + "flos": 22713125941440.0, + "grad_norm": 2.1764465617038575, + "language_loss": 0.59580147, + "learning_rate": 1.249952289154267e-06, + "loss": 0.62334669, + "num_input_tokens_seen": 113480840, + "router_z_loss_clip": 3.60742188, + "router_z_loss_mlp": 2.40039062, + "step": 5270, + "time_per_iteration": 3.044051170349121 + }, + { + "auxiliary_loss_clip": 0.01494987, + "auxiliary_loss_mlp": 0.01273375, + "balance_loss_clip": 1.13340306, + "balance_loss_mlp": 1.0362916, + "epoch": 0.6338002765586485, + "flos": 23625003872160.0, + "grad_norm": 2.5082056941376853, + "language_loss": 0.76964772, + "learning_rate": 1.2492302275366635e-06, + "loss": 0.79733133, + "num_input_tokens_seen": 113500515, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.36523438, + "step": 5271, + "time_per_iteration": 3.0153987407684326 + }, + { + "auxiliary_loss_clip": 0.01494146, + "auxiliary_loss_mlp": 0.0126795, + "balance_loss_clip": 1.13199496, + "balance_loss_mlp": 1.02667046, + "epoch": 0.6339205194492875, + "flos": 26507876238720.0, + "grad_norm": 2.4525300949283877, + "language_loss": 0.65346336, + "learning_rate": 1.2485082798102377e-06, + "loss": 0.68108433, + "num_input_tokens_seen": 113520930, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 2.40820312, + "step": 5272, + "time_per_iteration": 3.0419180393218994 + }, + { + "auxiliary_loss_clip": 0.01489272, + "auxiliary_loss_mlp": 0.01278413, + "balance_loss_clip": 1.12669933, + "balance_loss_mlp": 1.03675234, + "epoch": 0.6340407623399267, + "flos": 18545879541600.0, + "grad_norm": 2.6307958866893992, + "language_loss": 0.68656707, + "learning_rate": 1.2477864460845084e-06, + "loss": 0.71424395, + "num_input_tokens_seen": 113537330, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.41210938, + "step": 5273, + "time_per_iteration": 2.922523021697998 + }, + { + "auxiliary_loss_clip": 0.01494081, + "auxiliary_loss_mlp": 0.01271883, + "balance_loss_clip": 1.13275683, + "balance_loss_mlp": 1.02907753, + "epoch": 0.6341610052305657, + "flos": 17714523823200.0, + "grad_norm": 2.7317315950104644, + "language_loss": 0.73819858, + "learning_rate": 1.2470647264689776e-06, + "loss": 0.76585823, + "num_input_tokens_seen": 113555810, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 2.42382812, + "step": 5274, + "time_per_iteration": 2.9475231170654297 + }, + { + "auxiliary_loss_clip": 0.01485656, + "auxiliary_loss_mlp": 0.01272333, + "balance_loss_clip": 1.1234045, + "balance_loss_mlp": 1.03143501, + "epoch": 0.6342812481212048, + "flos": 23589313109280.0, + "grad_norm": 2.437371888971462, + "language_loss": 0.71414363, + "learning_rate": 1.2463431210731282e-06, + "loss": 0.74172354, + "num_input_tokens_seen": 113575395, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.40429688, + "step": 5275, + "time_per_iteration": 3.023014783859253 + }, + { + "auxiliary_loss_clip": 0.01493543, + "auxiliary_loss_mlp": 0.01273489, + "balance_loss_clip": 1.13268328, + "balance_loss_mlp": 1.03373575, + "epoch": 0.634401491011844, + "flos": 17823871801440.0, + "grad_norm": 2.63493962970093, + "language_loss": 0.76301992, + "learning_rate": 1.2456216300064289e-06, + "loss": 0.79069024, + "num_input_tokens_seen": 113592945, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 2.39257812, + "step": 5276, + "time_per_iteration": 2.9075114727020264 + }, + { + "auxiliary_loss_clip": 0.01494303, + "auxiliary_loss_mlp": 0.01265303, + "balance_loss_clip": 1.13399434, + "balance_loss_mlp": 1.0236423, + "epoch": 0.634521733902483, + "flos": 21360139138080.0, + "grad_norm": 2.2826849034682812, + "language_loss": 0.78419733, + "learning_rate": 1.244900253378328e-06, + "loss": 0.81179345, + "num_input_tokens_seen": 113613000, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 2.41210938, + "step": 5277, + "time_per_iteration": 3.147439479827881 + }, + { + "auxiliary_loss_clip": 0.01491677, + "auxiliary_loss_mlp": 0.01262867, + "balance_loss_clip": 1.13077807, + "balance_loss_mlp": 1.0221591, + "epoch": 0.6346419767931221, + "flos": 16546931687520.0, + "grad_norm": 2.1077434737379734, + "language_loss": 0.69733977, + "learning_rate": 1.2441789912982583e-06, + "loss": 0.72488523, + "num_input_tokens_seen": 113630085, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 2.40234375, + "step": 5278, + "time_per_iteration": 2.945261001586914 + }, + { + "auxiliary_loss_clip": 0.01493314, + "auxiliary_loss_mlp": 0.012716, + "balance_loss_clip": 1.13150036, + "balance_loss_mlp": 1.02936625, + "epoch": 0.6347622196837612, + "flos": 24353193902400.0, + "grad_norm": 1.9129235710540977, + "language_loss": 0.64648992, + "learning_rate": 1.2434578438756346e-06, + "loss": 0.67413902, + "num_input_tokens_seen": 113650515, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.41796875, + "step": 5279, + "time_per_iteration": 3.071185350418091 + }, + { + "auxiliary_loss_clip": 0.01486288, + "auxiliary_loss_mlp": 0.0126095, + "balance_loss_clip": 1.12459517, + "balance_loss_mlp": 1.02310336, + "epoch": 0.6348824625744003, + "flos": 64527708941280.0, + "grad_norm": 1.8783810609088536, + "language_loss": 0.77662969, + "learning_rate": 1.242736811219855e-06, + "loss": 0.80410206, + "num_input_tokens_seen": 113676475, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 2.37304688, + "step": 5280, + "time_per_iteration": 3.394937038421631 + }, + { + "auxiliary_loss_clip": 0.01495499, + "auxiliary_loss_mlp": 0.01262054, + "balance_loss_clip": 1.13474345, + "balance_loss_mlp": 1.02382588, + "epoch": 0.6350027054650393, + "flos": 28624705907040.0, + "grad_norm": 2.013876724955046, + "language_loss": 0.82238746, + "learning_rate": 1.2420158934402988e-06, + "loss": 0.84996301, + "num_input_tokens_seen": 113697090, + "router_z_loss_clip": 3.609375, + "router_z_loss_mlp": 2.37695312, + "step": 5281, + "time_per_iteration": 3.981950283050537 + }, + { + "auxiliary_loss_clip": 0.01491523, + "auxiliary_loss_mlp": 0.01270107, + "balance_loss_clip": 1.13009953, + "balance_loss_mlp": 1.02959037, + "epoch": 0.6351229483556785, + "flos": 23004758478240.0, + "grad_norm": 2.075865925465325, + "language_loss": 0.8489477, + "learning_rate": 1.2412950906463286e-06, + "loss": 0.87656403, + "num_input_tokens_seen": 113714395, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.40039062, + "step": 5282, + "time_per_iteration": 3.784625768661499 + }, + { + "auxiliary_loss_clip": 0.01491834, + "auxiliary_loss_mlp": 0.01269834, + "balance_loss_clip": 1.13110328, + "balance_loss_mlp": 1.02989006, + "epoch": 0.6352431912463176, + "flos": 21941166450240.0, + "grad_norm": 2.4480881180115817, + "language_loss": 0.90073442, + "learning_rate": 1.2405744029472902e-06, + "loss": 0.92835104, + "num_input_tokens_seen": 113733880, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 2.39453125, + "step": 5283, + "time_per_iteration": 2.996860980987549 + }, + { + "auxiliary_loss_clip": 0.01490294, + "auxiliary_loss_mlp": 0.01274616, + "balance_loss_clip": 1.12780285, + "balance_loss_mlp": 1.03448105, + "epoch": 0.6353634341369566, + "flos": 13736957973120.0, + "grad_norm": 2.1488507283623206, + "language_loss": 0.76441383, + "learning_rate": 1.2398538304525108e-06, + "loss": 0.79206288, + "num_input_tokens_seen": 113752505, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.39648438, + "step": 5284, + "time_per_iteration": 2.9483323097229004 + }, + { + "auxiliary_loss_clip": 0.01497886, + "auxiliary_loss_mlp": 0.01274498, + "balance_loss_clip": 1.13536799, + "balance_loss_mlp": 1.03188336, + "epoch": 0.6354836770275958, + "flos": 19318104529920.0, + "grad_norm": 3.0986971304682935, + "language_loss": 0.76070285, + "learning_rate": 1.2391333732713016e-06, + "loss": 0.7884267, + "num_input_tokens_seen": 113770310, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.421875, + "step": 5285, + "time_per_iteration": 2.986628293991089 + }, + { + "auxiliary_loss_clip": 0.01498203, + "auxiliary_loss_mlp": 0.01272827, + "balance_loss_clip": 1.13756728, + "balance_loss_mlp": 1.03002191, + "epoch": 0.6356039199182348, + "flos": 21615360276960.0, + "grad_norm": 2.0923915233530117, + "language_loss": 0.78590435, + "learning_rate": 1.2384130315129543e-06, + "loss": 0.81361467, + "num_input_tokens_seen": 113788635, + "router_z_loss_clip": 3.60742188, + "router_z_loss_mlp": 2.42382812, + "step": 5286, + "time_per_iteration": 3.881117105484009 + }, + { + "auxiliary_loss_clip": 0.01492562, + "auxiliary_loss_mlp": 0.0126815, + "balance_loss_clip": 1.13034558, + "balance_loss_mlp": 1.02839684, + "epoch": 0.6357241628088739, + "flos": 18113835499200.0, + "grad_norm": 3.236515944063252, + "language_loss": 0.74039459, + "learning_rate": 1.2376928052867447e-06, + "loss": 0.76800174, + "num_input_tokens_seen": 113807755, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.39257812, + "step": 5287, + "time_per_iteration": 2.93373441696167 + }, + { + "auxiliary_loss_clip": 0.01497702, + "auxiliary_loss_mlp": 0.01270698, + "balance_loss_clip": 1.13749897, + "balance_loss_mlp": 1.03132594, + "epoch": 0.6358444056995131, + "flos": 24937520964480.0, + "grad_norm": 2.0746469009558397, + "language_loss": 0.77497321, + "learning_rate": 1.2369726947019299e-06, + "loss": 0.80265725, + "num_input_tokens_seen": 113828230, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 2.38867188, + "step": 5288, + "time_per_iteration": 3.031636953353882 + }, + { + "auxiliary_loss_clip": 0.0149155, + "auxiliary_loss_mlp": 0.01261184, + "balance_loss_clip": 1.13053131, + "balance_loss_mlp": 1.0242908, + "epoch": 0.6359646485901521, + "flos": 23295518667360.0, + "grad_norm": 2.4894324835097725, + "language_loss": 0.66863394, + "learning_rate": 1.2362526998677511e-06, + "loss": 0.69616127, + "num_input_tokens_seen": 113844595, + "router_z_loss_clip": 3.60742188, + "router_z_loss_mlp": 2.36328125, + "step": 5289, + "time_per_iteration": 2.9634673595428467 + }, + { + "auxiliary_loss_clip": 0.01490592, + "auxiliary_loss_mlp": 0.01266587, + "balance_loss_clip": 1.12982059, + "balance_loss_mlp": 1.0312202, + "epoch": 0.6360848914807912, + "flos": 20889597648960.0, + "grad_norm": 6.024139819142218, + "language_loss": 0.8433699, + "learning_rate": 1.2355328208934301e-06, + "loss": 0.87094164, + "num_input_tokens_seen": 113863470, + "router_z_loss_clip": 3.60742188, + "router_z_loss_mlp": 2.34765625, + "step": 5290, + "time_per_iteration": 2.9264345169067383 + }, + { + "auxiliary_loss_clip": 0.0148954, + "auxiliary_loss_mlp": 0.01263252, + "balance_loss_clip": 1.12765265, + "balance_loss_mlp": 1.02349818, + "epoch": 0.6362051343714303, + "flos": 18481818150720.0, + "grad_norm": 1.7227672119105537, + "language_loss": 0.72540045, + "learning_rate": 1.2348130578881728e-06, + "loss": 0.75292832, + "num_input_tokens_seen": 113881690, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.39257812, + "step": 5291, + "time_per_iteration": 3.088376045227051 + }, + { + "auxiliary_loss_clip": 0.01493171, + "auxiliary_loss_mlp": 0.01278401, + "balance_loss_clip": 1.13274956, + "balance_loss_mlp": 1.03559518, + "epoch": 0.6363253772620694, + "flos": 24391918918080.0, + "grad_norm": 2.308235187322495, + "language_loss": 0.76305044, + "learning_rate": 1.2340934109611664e-06, + "loss": 0.79076618, + "num_input_tokens_seen": 113902450, + "router_z_loss_clip": 3.6015625, + "router_z_loss_mlp": 2.42382812, + "step": 5292, + "time_per_iteration": 3.056382656097412 + }, + { + "auxiliary_loss_clip": 0.01495724, + "auxiliary_loss_mlp": 0.01275742, + "balance_loss_clip": 1.13369, + "balance_loss_mlp": 1.03465331, + "epoch": 0.6364456201527084, + "flos": 25960643281440.0, + "grad_norm": 2.4911343467057603, + "language_loss": 0.68408918, + "learning_rate": 1.2333738802215798e-06, + "loss": 0.71180385, + "num_input_tokens_seen": 113922670, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.40625, + "step": 5293, + "time_per_iteration": 3.040027379989624 + }, + { + "auxiliary_loss_clip": 0.01496219, + "auxiliary_loss_mlp": 0.0126653, + "balance_loss_clip": 1.13442683, + "balance_loss_mlp": 1.02772975, + "epoch": 0.6365658630433476, + "flos": 20742814212480.0, + "grad_norm": 2.0571890402548254, + "language_loss": 0.81090832, + "learning_rate": 1.2326544657785668e-06, + "loss": 0.83853579, + "num_input_tokens_seen": 113942360, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 2.3828125, + "step": 5294, + "time_per_iteration": 3.883866786956787 + }, + { + "auxiliary_loss_clip": 0.01502157, + "auxiliary_loss_mlp": 0.01274607, + "balance_loss_clip": 1.14171565, + "balance_loss_mlp": 1.03504372, + "epoch": 0.6366861059339867, + "flos": 21436299612000.0, + "grad_norm": 2.5130074171573757, + "language_loss": 0.74584174, + "learning_rate": 1.2319351677412608e-06, + "loss": 0.7736094, + "num_input_tokens_seen": 113959405, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 2.390625, + "step": 5295, + "time_per_iteration": 2.9629082679748535 + }, + { + "auxiliary_loss_clip": 0.01495377, + "auxiliary_loss_mlp": 0.01264753, + "balance_loss_clip": 1.13395727, + "balance_loss_mlp": 1.0244267, + "epoch": 0.6368063488246257, + "flos": 22268944887840.0, + "grad_norm": 2.4682526185390237, + "language_loss": 0.74094397, + "learning_rate": 1.2312159862187796e-06, + "loss": 0.76854527, + "num_input_tokens_seen": 113977815, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 2.3984375, + "step": 5296, + "time_per_iteration": 2.9949159622192383 + }, + { + "auxiliary_loss_clip": 0.01498384, + "auxiliary_loss_mlp": 0.01280885, + "balance_loss_clip": 1.13764381, + "balance_loss_mlp": 1.03865135, + "epoch": 0.6369265917152649, + "flos": 22423010531040.0, + "grad_norm": 1.621691978566837, + "language_loss": 0.76155627, + "learning_rate": 1.2304969213202217e-06, + "loss": 0.78934896, + "num_input_tokens_seen": 113999075, + "router_z_loss_clip": 3.609375, + "router_z_loss_mlp": 2.41796875, + "step": 5297, + "time_per_iteration": 3.003971815109253 + }, + { + "auxiliary_loss_clip": 0.01494219, + "auxiliary_loss_mlp": 0.01268407, + "balance_loss_clip": 1.13308239, + "balance_loss_mlp": 1.02884352, + "epoch": 0.6370468346059039, + "flos": 24720531775200.0, + "grad_norm": 9.602615261805218, + "language_loss": 0.79195321, + "learning_rate": 1.2297779731546692e-06, + "loss": 0.81957942, + "num_input_tokens_seen": 114018170, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 2.390625, + "step": 5298, + "time_per_iteration": 2.9918806552886963 + }, + { + "auxiliary_loss_clip": 0.01499812, + "auxiliary_loss_mlp": 0.012656, + "balance_loss_clip": 1.13829207, + "balance_loss_mlp": 1.02660942, + "epoch": 0.637167077496543, + "flos": 25298676547200.0, + "grad_norm": 2.4682941078846596, + "language_loss": 0.78071821, + "learning_rate": 1.2290591418311853e-06, + "loss": 0.80837238, + "num_input_tokens_seen": 114035565, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 2.38476562, + "step": 5299, + "time_per_iteration": 3.102607488632202 + }, + { + "auxiliary_loss_clip": 0.01493895, + "auxiliary_loss_mlp": 0.01274653, + "balance_loss_clip": 1.13287842, + "balance_loss_mlp": 1.03432691, + "epoch": 0.637287320387182, + "flos": 27673496110080.0, + "grad_norm": 1.6557543096672642, + "language_loss": 0.7205075, + "learning_rate": 1.2283404274588172e-06, + "loss": 0.74819297, + "num_input_tokens_seen": 114054510, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 2.3984375, + "step": 5300, + "time_per_iteration": 3.043957471847534 + }, + { + "auxiliary_loss_clip": 0.01466088, + "auxiliary_loss_mlp": 0.01202797, + "balance_loss_clip": 1.11435914, + "balance_loss_mlp": 1.00519562, + "epoch": 0.6374075632778212, + "flos": 63179917931520.0, + "grad_norm": 0.7419807982891731, + "language_loss": 0.52788687, + "learning_rate": 1.227621830146592e-06, + "loss": 0.55457568, + "num_input_tokens_seen": 114109875, + "router_z_loss_clip": 3.515625, + "router_z_loss_mlp": 1.97265625, + "step": 5301, + "time_per_iteration": 3.382305860519409 + }, + { + "auxiliary_loss_clip": 0.01494433, + "auxiliary_loss_mlp": 0.01260504, + "balance_loss_clip": 1.13314927, + "balance_loss_mlp": 1.02055967, + "epoch": 0.6375278061684603, + "flos": 25560573042240.0, + "grad_norm": 3.0722493080776694, + "language_loss": 0.78957421, + "learning_rate": 1.2269033500035217e-06, + "loss": 0.81712359, + "num_input_tokens_seen": 114130010, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 2.39453125, + "step": 5302, + "time_per_iteration": 3.0521109104156494 + }, + { + "auxiliary_loss_clip": 0.01494263, + "auxiliary_loss_mlp": 0.0126405, + "balance_loss_clip": 1.13388896, + "balance_loss_mlp": 1.02372444, + "epoch": 0.6376480490590993, + "flos": 25668745247520.0, + "grad_norm": 1.8851622297191393, + "language_loss": 0.73778367, + "learning_rate": 1.2261849871385988e-06, + "loss": 0.76536679, + "num_input_tokens_seen": 114151115, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 2.3984375, + "step": 5303, + "time_per_iteration": 3.110527992248535 + }, + { + "auxiliary_loss_clip": 0.01494951, + "auxiliary_loss_mlp": 0.01266719, + "balance_loss_clip": 1.13459289, + "balance_loss_mlp": 1.02543986, + "epoch": 0.6377682919497385, + "flos": 31540234783680.0, + "grad_norm": 2.1897766333579582, + "language_loss": 0.62800932, + "learning_rate": 1.2254667416607972e-06, + "loss": 0.65562606, + "num_input_tokens_seen": 114172715, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 2.40820312, + "step": 5304, + "time_per_iteration": 3.1560800075531006 + }, + { + "auxiliary_loss_clip": 0.01492613, + "auxiliary_loss_mlp": 0.01275409, + "balance_loss_clip": 1.13038778, + "balance_loss_mlp": 1.03489268, + "epoch": 0.6378885348403776, + "flos": 23041435373280.0, + "grad_norm": 1.9046022679336727, + "language_loss": 0.83128458, + "learning_rate": 1.2247486136790756e-06, + "loss": 0.8589648, + "num_input_tokens_seen": 114192195, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 2.40039062, + "step": 5305, + "time_per_iteration": 3.0552070140838623 + }, + { + "auxiliary_loss_clip": 0.01496621, + "auxiliary_loss_mlp": 0.01268641, + "balance_loss_clip": 1.13624191, + "balance_loss_mlp": 1.03022242, + "epoch": 0.6380087777310166, + "flos": 18699034908960.0, + "grad_norm": 3.247422316533528, + "language_loss": 0.80632979, + "learning_rate": 1.2240306033023726e-06, + "loss": 0.83398241, + "num_input_tokens_seen": 114210020, + "router_z_loss_clip": 3.60742188, + "router_z_loss_mlp": 2.37890625, + "step": 5306, + "time_per_iteration": 3.08577561378479 + }, + { + "auxiliary_loss_clip": 0.0149414, + "auxiliary_loss_mlp": 0.01262908, + "balance_loss_clip": 1.13369322, + "balance_loss_mlp": 1.02048421, + "epoch": 0.6381290206216558, + "flos": 23333826473280.0, + "grad_norm": 2.488582728565761, + "language_loss": 0.72390711, + "learning_rate": 1.223312710639611e-06, + "loss": 0.7514776, + "num_input_tokens_seen": 114228740, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 2.41992188, + "step": 5307, + "time_per_iteration": 3.0400404930114746 + }, + { + "auxiliary_loss_clip": 0.01506892, + "auxiliary_loss_mlp": 0.01263593, + "balance_loss_clip": 1.14651275, + "balance_loss_mlp": 1.02059639, + "epoch": 0.6382492635122948, + "flos": 18882495240480.0, + "grad_norm": 2.133609122136513, + "language_loss": 0.86802006, + "learning_rate": 1.2225949357996928e-06, + "loss": 0.89572489, + "num_input_tokens_seen": 114246865, + "router_z_loss_clip": 3.60742188, + "router_z_loss_mlp": 2.42578125, + "step": 5308, + "time_per_iteration": 3.0130295753479004 + }, + { + "auxiliary_loss_clip": 0.01491785, + "auxiliary_loss_mlp": 0.01266167, + "balance_loss_clip": 1.13199937, + "balance_loss_mlp": 1.02679491, + "epoch": 0.6383695064029339, + "flos": 27821644960320.0, + "grad_norm": 1.702246442382565, + "language_loss": 0.80448467, + "learning_rate": 1.221877278891505e-06, + "loss": 0.83206421, + "num_input_tokens_seen": 114266120, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 2.38867188, + "step": 5309, + "time_per_iteration": 3.9146549701690674 + }, + { + "auxiliary_loss_clip": 0.01494723, + "auxiliary_loss_mlp": 0.01271589, + "balance_loss_clip": 1.13115227, + "balance_loss_mlp": 1.0310719, + "epoch": 0.638489749293573, + "flos": 26398035194400.0, + "grad_norm": 8.211260711311747, + "language_loss": 0.71424949, + "learning_rate": 1.221159740023915e-06, + "loss": 0.7419126, + "num_input_tokens_seen": 114285950, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 2.40039062, + "step": 5310, + "time_per_iteration": 3.0712177753448486 + }, + { + "auxiliary_loss_clip": 0.01499602, + "auxiliary_loss_mlp": 0.01259606, + "balance_loss_clip": 1.14037752, + "balance_loss_mlp": 1.01756334, + "epoch": 0.6386099921842121, + "flos": 23990634977760.0, + "grad_norm": 5.3598464137529485, + "language_loss": 0.72408348, + "learning_rate": 1.2204423193057735e-06, + "loss": 0.75167549, + "num_input_tokens_seen": 114304780, + "router_z_loss_clip": 3.59570312, + "router_z_loss_mlp": 2.41601562, + "step": 5311, + "time_per_iteration": 3.0034217834472656 + }, + { + "auxiliary_loss_clip": 0.01463485, + "auxiliary_loss_mlp": 0.01202957, + "balance_loss_clip": 1.11233544, + "balance_loss_mlp": 1.00611877, + "epoch": 0.6387302350748512, + "flos": 71737241221440.0, + "grad_norm": 0.8519316186105809, + "language_loss": 0.63335645, + "learning_rate": 1.2197250168459122e-06, + "loss": 0.66002083, + "num_input_tokens_seen": 114361180, + "router_z_loss_clip": 3.515625, + "router_z_loss_mlp": 1.96484375, + "step": 5312, + "time_per_iteration": 3.5188426971435547 + }, + { + "auxiliary_loss_clip": 0.01501706, + "auxiliary_loss_mlp": 0.01273145, + "balance_loss_clip": 1.14129114, + "balance_loss_mlp": 1.03453529, + "epoch": 0.6388504779654903, + "flos": 14537401876800.0, + "grad_norm": 3.3897374882655673, + "language_loss": 0.74012184, + "learning_rate": 1.2190078327531454e-06, + "loss": 0.76787031, + "num_input_tokens_seen": 114377425, + "router_z_loss_clip": 3.60351562, + "router_z_loss_mlp": 2.38085938, + "step": 5313, + "time_per_iteration": 3.00691819190979 + }, + { + "auxiliary_loss_clip": 0.01495038, + "auxiliary_loss_mlp": 0.01266029, + "balance_loss_clip": 1.13399816, + "balance_loss_mlp": 1.02741933, + "epoch": 0.6389707208561294, + "flos": 22348329255360.0, + "grad_norm": 1.6348013345613461, + "language_loss": 0.72896498, + "learning_rate": 1.2182907671362697e-06, + "loss": 0.75657564, + "num_input_tokens_seen": 114398120, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 2.38085938, + "step": 5314, + "time_per_iteration": 3.9325320720672607 + }, + { + "auxiliary_loss_clip": 0.01497376, + "auxiliary_loss_mlp": 0.01270557, + "balance_loss_clip": 1.13594198, + "balance_loss_mlp": 1.03080297, + "epoch": 0.6390909637467684, + "flos": 19428893778240.0, + "grad_norm": 4.582106622461537, + "language_loss": 0.78647697, + "learning_rate": 1.2175738201040626e-06, + "loss": 0.81415629, + "num_input_tokens_seen": 114415160, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 2.39257812, + "step": 5315, + "time_per_iteration": 3.097170829772949 + }, + { + "auxiliary_loss_clip": 0.01496143, + "auxiliary_loss_mlp": 0.01284005, + "balance_loss_clip": 1.13450789, + "balance_loss_mlp": 1.04272532, + "epoch": 0.6392112066374076, + "flos": 24092776605600.0, + "grad_norm": 2.0244879850551705, + "language_loss": 0.78733611, + "learning_rate": 1.2168569917652855e-06, + "loss": 0.81513762, + "num_input_tokens_seen": 114435015, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.40820312, + "step": 5316, + "time_per_iteration": 3.014451265335083 + }, + { + "auxiliary_loss_clip": 0.01494364, + "auxiliary_loss_mlp": 0.01271886, + "balance_loss_clip": 1.1332407, + "balance_loss_mlp": 1.03175092, + "epoch": 0.6393314495280467, + "flos": 26797460654880.0, + "grad_norm": 11.19127994302844, + "language_loss": 0.63991994, + "learning_rate": 1.2161402822286797e-06, + "loss": 0.66758239, + "num_input_tokens_seen": 114455700, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 2.39648438, + "step": 5317, + "time_per_iteration": 2.9964799880981445 + }, + { + "auxiliary_loss_clip": 0.01497071, + "auxiliary_loss_mlp": 0.01272345, + "balance_loss_clip": 1.13526821, + "balance_loss_mlp": 1.03144729, + "epoch": 0.6394516924186857, + "flos": 20262449329920.0, + "grad_norm": 2.2077729509578368, + "language_loss": 0.7894994, + "learning_rate": 1.2154236916029703e-06, + "loss": 0.81719363, + "num_input_tokens_seen": 114473675, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.40429688, + "step": 5318, + "time_per_iteration": 3.182832956314087 + }, + { + "auxiliary_loss_clip": 0.01492935, + "auxiliary_loss_mlp": 0.01268552, + "balance_loss_clip": 1.13315821, + "balance_loss_mlp": 1.02879882, + "epoch": 0.6395719353093249, + "flos": 18370801333440.0, + "grad_norm": 2.6868138075096097, + "language_loss": 0.73767459, + "learning_rate": 1.2147072199968627e-06, + "loss": 0.76528943, + "num_input_tokens_seen": 114492310, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 2.39257812, + "step": 5319, + "time_per_iteration": 3.0725650787353516 + }, + { + "auxiliary_loss_clip": 0.01491367, + "auxiliary_loss_mlp": 0.01265666, + "balance_loss_clip": 1.13108146, + "balance_loss_mlp": 1.0253396, + "epoch": 0.6396921781999639, + "flos": 17568233452800.0, + "grad_norm": 2.036379409017831, + "language_loss": 0.71585536, + "learning_rate": 1.2139908675190454e-06, + "loss": 0.74342573, + "num_input_tokens_seen": 114511520, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 2.3984375, + "step": 5320, + "time_per_iteration": 3.7869622707366943 + }, + { + "auxiliary_loss_clip": 0.01495198, + "auxiliary_loss_mlp": 0.01269982, + "balance_loss_clip": 1.13395631, + "balance_loss_mlp": 1.02889299, + "epoch": 0.639812421090603, + "flos": 21253370274720.0, + "grad_norm": 2.4194513217510605, + "language_loss": 0.75245512, + "learning_rate": 1.2132746342781883e-06, + "loss": 0.7801069, + "num_input_tokens_seen": 114532680, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 2.40625, + "step": 5321, + "time_per_iteration": 3.069561004638672 + }, + { + "auxiliary_loss_clip": 0.01493428, + "auxiliary_loss_mlp": 0.01270709, + "balance_loss_clip": 1.13308382, + "balance_loss_mlp": 1.02885699, + "epoch": 0.6399326639812422, + "flos": 11182508822880.0, + "grad_norm": 5.825275225377163, + "language_loss": 0.80162466, + "learning_rate": 1.2125585203829442e-06, + "loss": 0.82926595, + "num_input_tokens_seen": 114548320, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 2.4140625, + "step": 5322, + "time_per_iteration": 3.096024513244629 + }, + { + "auxiliary_loss_clip": 0.01495036, + "auxiliary_loss_mlp": 0.01276811, + "balance_loss_clip": 1.13287067, + "balance_loss_mlp": 1.03686643, + "epoch": 0.6400529068718812, + "flos": 23913109090080.0, + "grad_norm": 1.9707701044972765, + "language_loss": 0.74652845, + "learning_rate": 1.211842525941946e-06, + "loss": 0.77424681, + "num_input_tokens_seen": 114568115, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.39453125, + "step": 5323, + "time_per_iteration": 3.005441665649414 + }, + { + "auxiliary_loss_clip": 0.01488039, + "auxiliary_loss_mlp": 0.01266656, + "balance_loss_clip": 1.12784028, + "balance_loss_mlp": 1.02804649, + "epoch": 0.6401731497625203, + "flos": 44022137555520.0, + "grad_norm": 3.081080442479947, + "language_loss": 0.792395, + "learning_rate": 1.2111266510638105e-06, + "loss": 0.81994194, + "num_input_tokens_seen": 114591040, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 2.3828125, + "step": 5324, + "time_per_iteration": 3.190656900405884 + }, + { + "auxiliary_loss_clip": 0.0149702, + "auxiliary_loss_mlp": 0.01271169, + "balance_loss_clip": 1.1362083, + "balance_loss_mlp": 1.02969861, + "epoch": 0.6402933926531594, + "flos": 20663998767360.0, + "grad_norm": 2.069548976910836, + "language_loss": 0.80231315, + "learning_rate": 1.2104108958571346e-06, + "loss": 0.82999504, + "num_input_tokens_seen": 114609310, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 2.41015625, + "step": 5325, + "time_per_iteration": 3.0747597217559814 + }, + { + "auxiliary_loss_clip": 0.0149444, + "auxiliary_loss_mlp": 0.01265235, + "balance_loss_clip": 1.13388777, + "balance_loss_mlp": 1.02662539, + "epoch": 0.6404136355437985, + "flos": 24865646372640.0, + "grad_norm": 1.9133070208670084, + "language_loss": 0.75678957, + "learning_rate": 1.2096952604304975e-06, + "loss": 0.78438628, + "num_input_tokens_seen": 114629740, + "router_z_loss_clip": 3.60742188, + "router_z_loss_mlp": 2.38085938, + "step": 5326, + "time_per_iteration": 3.096269130706787 + }, + { + "auxiliary_loss_clip": 0.01492211, + "auxiliary_loss_mlp": 0.01274598, + "balance_loss_clip": 1.13108253, + "balance_loss_mlp": 1.033319, + "epoch": 0.6405338784344375, + "flos": 40482760109760.0, + "grad_norm": 3.023437796019863, + "language_loss": 0.70606017, + "learning_rate": 1.2089797448924616e-06, + "loss": 0.73372829, + "num_input_tokens_seen": 114653615, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 2.40820312, + "step": 5327, + "time_per_iteration": 3.192497730255127 + }, + { + "auxiliary_loss_clip": 0.01493798, + "auxiliary_loss_mlp": 0.0126834, + "balance_loss_clip": 1.13227916, + "balance_loss_mlp": 1.02477145, + "epoch": 0.6406541213250767, + "flos": 20888118450720.0, + "grad_norm": 2.121103037374763, + "language_loss": 0.66068143, + "learning_rate": 1.2082643493515692e-06, + "loss": 0.68830281, + "num_input_tokens_seen": 114671935, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 2.43164062, + "step": 5328, + "time_per_iteration": 2.976036310195923 + }, + { + "auxiliary_loss_clip": 0.01491642, + "auxiliary_loss_mlp": 0.01267006, + "balance_loss_clip": 1.13072348, + "balance_loss_mlp": 1.02801561, + "epoch": 0.6407743642157158, + "flos": 23298287423040.0, + "grad_norm": 6.143189328909176, + "language_loss": 0.81521535, + "learning_rate": 1.207549073916346e-06, + "loss": 0.84280187, + "num_input_tokens_seen": 114692870, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 2.38476562, + "step": 5329, + "time_per_iteration": 3.0125181674957275 + }, + { + "auxiliary_loss_clip": 0.01491114, + "auxiliary_loss_mlp": 0.012732, + "balance_loss_clip": 1.13009191, + "balance_loss_mlp": 1.03478146, + "epoch": 0.6408946071063548, + "flos": 15014429081280.0, + "grad_norm": 2.2943881912050696, + "language_loss": 0.77762973, + "learning_rate": 1.2068339186952976e-06, + "loss": 0.80527294, + "num_input_tokens_seen": 114710410, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 2.37890625, + "step": 5330, + "time_per_iteration": 2.9603464603424072 + }, + { + "auxiliary_loss_clip": 0.01490327, + "auxiliary_loss_mlp": 0.01269012, + "balance_loss_clip": 1.12948096, + "balance_loss_mlp": 1.02906799, + "epoch": 0.6410148499969939, + "flos": 22530803454720.0, + "grad_norm": 3.375165063743414, + "language_loss": 0.7338621, + "learning_rate": 1.2061188837969136e-06, + "loss": 0.76145548, + "num_input_tokens_seen": 114730020, + "router_z_loss_clip": 3.609375, + "router_z_loss_mlp": 2.39453125, + "step": 5331, + "time_per_iteration": 2.96774959564209 + }, + { + "auxiliary_loss_clip": 0.01491646, + "auxiliary_loss_mlp": 0.0126983, + "balance_loss_clip": 1.12929583, + "balance_loss_mlp": 1.02912295, + "epoch": 0.641135092887633, + "flos": 12423644389440.0, + "grad_norm": 3.7490381698291615, + "language_loss": 0.84271598, + "learning_rate": 1.2054039693296631e-06, + "loss": 0.87033075, + "num_input_tokens_seen": 114748015, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 2.40234375, + "step": 5332, + "time_per_iteration": 2.9781394004821777 + }, + { + "auxiliary_loss_clip": 0.01487824, + "auxiliary_loss_mlp": 0.01263878, + "balance_loss_clip": 1.12786984, + "balance_loss_mlp": 1.02603137, + "epoch": 0.6412553357782721, + "flos": 22129519514400.0, + "grad_norm": 1.7560608080795626, + "language_loss": 0.81699502, + "learning_rate": 1.2046891754019992e-06, + "loss": 0.84451199, + "num_input_tokens_seen": 114768625, + "router_z_loss_clip": 3.6015625, + "router_z_loss_mlp": 2.37304688, + "step": 5333, + "time_per_iteration": 3.0322132110595703 + }, + { + "auxiliary_loss_clip": 0.01496337, + "auxiliary_loss_mlp": 0.01267603, + "balance_loss_clip": 1.13541412, + "balance_loss_mlp": 1.02823091, + "epoch": 0.6413755786689112, + "flos": 15890464536480.0, + "grad_norm": 2.1472468743342064, + "language_loss": 0.828861, + "learning_rate": 1.2039745021223548e-06, + "loss": 0.85650039, + "num_input_tokens_seen": 114786045, + "router_z_loss_clip": 3.609375, + "router_z_loss_mlp": 2.38867188, + "step": 5334, + "time_per_iteration": 2.95524525642395 + }, + { + "auxiliary_loss_clip": 0.01461706, + "auxiliary_loss_mlp": 0.0119986, + "balance_loss_clip": 1.10864174, + "balance_loss_mlp": 1.00073242, + "epoch": 0.6414958215595503, + "flos": 68046680308320.0, + "grad_norm": 0.7954401323051886, + "language_loss": 0.56986308, + "learning_rate": 1.2032599495991456e-06, + "loss": 0.59647876, + "num_input_tokens_seen": 114850785, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 1.984375, + "step": 5335, + "time_per_iteration": 3.5411391258239746 + }, + { + "auxiliary_loss_clip": 0.01495734, + "auxiliary_loss_mlp": 0.01274262, + "balance_loss_clip": 1.13428569, + "balance_loss_mlp": 1.03241003, + "epoch": 0.6416160644501894, + "flos": 44094694854240.0, + "grad_norm": 1.8104617416859259, + "language_loss": 0.69628918, + "learning_rate": 1.2025455179407685e-06, + "loss": 0.72398913, + "num_input_tokens_seen": 114871945, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 2.4140625, + "step": 5336, + "time_per_iteration": 3.9279210567474365 + }, + { + "auxiliary_loss_clip": 0.01488447, + "auxiliary_loss_mlp": 0.01264642, + "balance_loss_clip": 1.12781024, + "balance_loss_mlp": 1.02546048, + "epoch": 0.6417363073408284, + "flos": 20961965306880.0, + "grad_norm": 1.9794881270008506, + "language_loss": 0.73669869, + "learning_rate": 1.2018312072556022e-06, + "loss": 0.7642296, + "num_input_tokens_seen": 114890445, + "router_z_loss_clip": 3.60742188, + "router_z_loss_mlp": 2.38671875, + "step": 5337, + "time_per_iteration": 2.980482816696167 + }, + { + "auxiliary_loss_clip": 0.0149081, + "auxiliary_loss_mlp": 0.01263965, + "balance_loss_clip": 1.12980521, + "balance_loss_mlp": 1.02516484, + "epoch": 0.6418565502314676, + "flos": 22457297952000.0, + "grad_norm": 3.8885581619504843, + "language_loss": 0.74079871, + "learning_rate": 1.2011170176520077e-06, + "loss": 0.76834643, + "num_input_tokens_seen": 114911360, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 2.3828125, + "step": 5338, + "time_per_iteration": 3.0332577228546143 + }, + { + "auxiliary_loss_clip": 0.01491533, + "auxiliary_loss_mlp": 0.01271059, + "balance_loss_clip": 1.12889814, + "balance_loss_mlp": 1.03283119, + "epoch": 0.6419767931221066, + "flos": 25047817146720.0, + "grad_norm": 1.5882297254070301, + "language_loss": 0.81338811, + "learning_rate": 1.2004029492383256e-06, + "loss": 0.84101397, + "num_input_tokens_seen": 114932700, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.37695312, + "step": 5339, + "time_per_iteration": 3.007492780685425 + }, + { + "auxiliary_loss_clip": 0.01492674, + "auxiliary_loss_mlp": 0.01277756, + "balance_loss_clip": 1.13170302, + "balance_loss_mlp": 1.03743029, + "epoch": 0.6420970360127457, + "flos": 19465722385920.0, + "grad_norm": 2.7518723068183224, + "language_loss": 0.73565125, + "learning_rate": 1.1996890021228814e-06, + "loss": 0.76335549, + "num_input_tokens_seen": 114949475, + "router_z_loss_clip": 3.609375, + "router_z_loss_mlp": 2.3984375, + "step": 5340, + "time_per_iteration": 3.0143392086029053 + }, + { + "auxiliary_loss_clip": 0.01492508, + "auxiliary_loss_mlp": 0.01265195, + "balance_loss_clip": 1.13101196, + "balance_loss_mlp": 1.02677655, + "epoch": 0.6422172789033849, + "flos": 40409330463360.0, + "grad_norm": 1.6106209992763743, + "language_loss": 0.70072937, + "learning_rate": 1.1989751764139785e-06, + "loss": 0.72830635, + "num_input_tokens_seen": 114973125, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 2.37890625, + "step": 5341, + "time_per_iteration": 3.9438085556030273 + }, + { + "auxiliary_loss_clip": 0.01488812, + "auxiliary_loss_mlp": 0.01271364, + "balance_loss_clip": 1.12728786, + "balance_loss_mlp": 1.02817726, + "epoch": 0.6423375217940239, + "flos": 27675354589920.0, + "grad_norm": 1.7006383788909276, + "language_loss": 0.8317681, + "learning_rate": 1.1982614722199044e-06, + "loss": 0.85936987, + "num_input_tokens_seen": 114994300, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 2.42773438, + "step": 5342, + "time_per_iteration": 2.976731777191162 + }, + { + "auxiliary_loss_clip": 0.01491233, + "auxiliary_loss_mlp": 0.01271472, + "balance_loss_clip": 1.13095975, + "balance_loss_mlp": 1.03152812, + "epoch": 0.642457764684663, + "flos": 18371218543200.0, + "grad_norm": 2.4549163607156532, + "language_loss": 0.77767026, + "learning_rate": 1.1975478896489276e-06, + "loss": 0.80529732, + "num_input_tokens_seen": 115012135, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 2.39453125, + "step": 5343, + "time_per_iteration": 2.9759292602539062 + }, + { + "auxiliary_loss_clip": 0.01488055, + "auxiliary_loss_mlp": 0.01268648, + "balance_loss_clip": 1.12555861, + "balance_loss_mlp": 1.03061068, + "epoch": 0.6425780075753021, + "flos": 19752272549280.0, + "grad_norm": 1.9878989746381972, + "language_loss": 0.76419508, + "learning_rate": 1.1968344288092981e-06, + "loss": 0.79176211, + "num_input_tokens_seen": 115028715, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 2.375, + "step": 5344, + "time_per_iteration": 2.904921054840088 + }, + { + "auxiliary_loss_clip": 0.01490823, + "auxiliary_loss_mlp": 0.01268683, + "balance_loss_clip": 1.12987447, + "balance_loss_mlp": 1.02911985, + "epoch": 0.6426982504659412, + "flos": 20560643438400.0, + "grad_norm": 1.7263305260387483, + "language_loss": 0.64717144, + "learning_rate": 1.1961210898092468e-06, + "loss": 0.67476642, + "num_input_tokens_seen": 115047665, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 2.390625, + "step": 5345, + "time_per_iteration": 3.0115697383880615 + }, + { + "auxiliary_loss_clip": 0.01493322, + "auxiliary_loss_mlp": 0.01290472, + "balance_loss_clip": 1.13287711, + "balance_loss_mlp": 1.04862022, + "epoch": 0.6428184933565803, + "flos": 17853569915040.0, + "grad_norm": 2.5091631985755063, + "language_loss": 0.78968203, + "learning_rate": 1.1954078727569874e-06, + "loss": 0.81751996, + "num_input_tokens_seen": 115064965, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 2.4140625, + "step": 5346, + "time_per_iteration": 2.9807143211364746 + }, + { + "auxiliary_loss_clip": 0.0149273, + "auxiliary_loss_mlp": 0.01267728, + "balance_loss_clip": 1.13175511, + "balance_loss_mlp": 1.02930987, + "epoch": 0.6429387362472194, + "flos": 22459232288160.0, + "grad_norm": 1.7530919829441678, + "language_loss": 0.7786113, + "learning_rate": 1.1946947777607141e-06, + "loss": 0.80621588, + "num_input_tokens_seen": 115086100, + "router_z_loss_clip": 3.609375, + "router_z_loss_mlp": 2.37890625, + "step": 5347, + "time_per_iteration": 3.0357136726379395 + }, + { + "auxiliary_loss_clip": 0.01490473, + "auxiliary_loss_mlp": 0.01267463, + "balance_loss_clip": 1.12896156, + "balance_loss_mlp": 1.02656484, + "epoch": 0.6430589791378585, + "flos": 24754819196160.0, + "grad_norm": 3.5032443882543927, + "language_loss": 0.80444956, + "learning_rate": 1.1939818049286024e-06, + "loss": 0.83202893, + "num_input_tokens_seen": 115104260, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 2.40429688, + "step": 5348, + "time_per_iteration": 3.940448760986328 + }, + { + "auxiliary_loss_clip": 0.01492577, + "auxiliary_loss_mlp": 0.01268547, + "balance_loss_clip": 1.13102984, + "balance_loss_mlp": 1.03241777, + "epoch": 0.6431792220284975, + "flos": 24903650753280.0, + "grad_norm": 2.009594403204055, + "language_loss": 0.75891441, + "learning_rate": 1.1932689543688101e-06, + "loss": 0.78652573, + "num_input_tokens_seen": 115125365, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 2.35546875, + "step": 5349, + "time_per_iteration": 3.0191001892089844 + }, + { + "auxiliary_loss_clip": 0.01495708, + "auxiliary_loss_mlp": 0.01275538, + "balance_loss_clip": 1.13429034, + "balance_loss_mlp": 1.03196907, + "epoch": 0.6432994649191367, + "flos": 21034408821120.0, + "grad_norm": 2.36092895384328, + "language_loss": 0.72724557, + "learning_rate": 1.1925562261894756e-06, + "loss": 0.75495803, + "num_input_tokens_seen": 115144445, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 2.43164062, + "step": 5350, + "time_per_iteration": 2.9275124073028564 + }, + { + "auxiliary_loss_clip": 0.01494695, + "auxiliary_loss_mlp": 0.01264349, + "balance_loss_clip": 1.13371992, + "balance_loss_mlp": 1.02631235, + "epoch": 0.6434197078097758, + "flos": 30887143238880.0, + "grad_norm": 1.9429027018989729, + "language_loss": 0.77727318, + "learning_rate": 1.1918436204987207e-06, + "loss": 0.80486357, + "num_input_tokens_seen": 115166305, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 2.375, + "step": 5351, + "time_per_iteration": 3.0010082721710205 + }, + { + "auxiliary_loss_clip": 0.01492775, + "auxiliary_loss_mlp": 0.01267637, + "balance_loss_clip": 1.13044429, + "balance_loss_mlp": 1.02998161, + "epoch": 0.6435399507004148, + "flos": 15014277368640.0, + "grad_norm": 2.3956681336991767, + "language_loss": 0.81870759, + "learning_rate": 1.191131137404645e-06, + "loss": 0.84631169, + "num_input_tokens_seen": 115183045, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 2.37109375, + "step": 5352, + "time_per_iteration": 2.9375863075256348 + }, + { + "auxiliary_loss_clip": 0.01500234, + "auxiliary_loss_mlp": 0.01277023, + "balance_loss_clip": 1.13944006, + "balance_loss_mlp": 1.03860438, + "epoch": 0.643660193591054, + "flos": 19904062502880.0, + "grad_norm": 2.7250107312077074, + "language_loss": 0.76821983, + "learning_rate": 1.190418777015333e-06, + "loss": 0.79599237, + "num_input_tokens_seen": 115201955, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 2.37890625, + "step": 5353, + "time_per_iteration": 2.9639358520507812 + }, + { + "auxiliary_loss_clip": 0.01494555, + "auxiliary_loss_mlp": 0.01261749, + "balance_loss_clip": 1.13320589, + "balance_loss_mlp": 1.02466583, + "epoch": 0.643780436481693, + "flos": 24135901287840.0, + "grad_norm": 1.4411594359806263, + "language_loss": 0.73304367, + "learning_rate": 1.1897065394388487e-06, + "loss": 0.76060677, + "num_input_tokens_seen": 115222395, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 2.36523438, + "step": 5354, + "time_per_iteration": 2.991732120513916 + }, + { + "auxiliary_loss_clip": 0.01509194, + "auxiliary_loss_mlp": 0.01270387, + "balance_loss_clip": 1.15034032, + "balance_loss_mlp": 1.03044224, + "epoch": 0.6439006793723321, + "flos": 23151048848640.0, + "grad_norm": 1.5372976565351364, + "language_loss": 0.76762748, + "learning_rate": 1.1889944247832385e-06, + "loss": 0.79542327, + "num_input_tokens_seen": 115242635, + "router_z_loss_clip": 3.59375, + "router_z_loss_mlp": 2.39453125, + "step": 5355, + "time_per_iteration": 2.981034755706787 + }, + { + "auxiliary_loss_clip": 0.0149703, + "auxiliary_loss_mlp": 0.0127494, + "balance_loss_clip": 1.13643873, + "balance_loss_mlp": 1.03518605, + "epoch": 0.6440209222629713, + "flos": 23619883570560.0, + "grad_norm": 1.8982394661154351, + "language_loss": 0.70686877, + "learning_rate": 1.1882824331565283e-06, + "loss": 0.7345885, + "num_input_tokens_seen": 115262095, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 2.39257812, + "step": 5356, + "time_per_iteration": 2.9882113933563232 + }, + { + "auxiliary_loss_clip": 0.01503712, + "auxiliary_loss_mlp": 0.0126385, + "balance_loss_clip": 1.14409709, + "balance_loss_mlp": 1.02657557, + "epoch": 0.6441411651536103, + "flos": 16546742046720.0, + "grad_norm": 3.3790578144055856, + "language_loss": 0.88800991, + "learning_rate": 1.1875705646667287e-06, + "loss": 0.91568553, + "num_input_tokens_seen": 115279985, + "router_z_loss_clip": 3.59375, + "router_z_loss_mlp": 2.3671875, + "step": 5357, + "time_per_iteration": 2.8996429443359375 + }, + { + "auxiliary_loss_clip": 0.0149816, + "auxiliary_loss_mlp": 0.01266786, + "balance_loss_clip": 1.13844466, + "balance_loss_mlp": 1.02607882, + "epoch": 0.6442614080442494, + "flos": 25413334467840.0, + "grad_norm": 2.4350099258759452, + "language_loss": 0.75510025, + "learning_rate": 1.1868588194218282e-06, + "loss": 0.78274971, + "num_input_tokens_seen": 115300365, + "router_z_loss_clip": 3.59570312, + "router_z_loss_mlp": 2.40234375, + "step": 5358, + "time_per_iteration": 3.056433916091919 + }, + { + "auxiliary_loss_clip": 0.01503627, + "auxiliary_loss_mlp": 0.01265698, + "balance_loss_clip": 1.14503121, + "balance_loss_mlp": 1.0265162, + "epoch": 0.6443816509348885, + "flos": 28296586116000.0, + "grad_norm": 1.7459693890172658, + "language_loss": 0.73947608, + "learning_rate": 1.1861471975297979e-06, + "loss": 0.76716936, + "num_input_tokens_seen": 115322060, + "router_z_loss_clip": 3.58398438, + "router_z_loss_mlp": 2.38671875, + "step": 5359, + "time_per_iteration": 3.0993311405181885 + }, + { + "auxiliary_loss_clip": 0.01504107, + "auxiliary_loss_mlp": 0.01272949, + "balance_loss_clip": 1.14258838, + "balance_loss_mlp": 1.03014374, + "epoch": 0.6445018938255276, + "flos": 36693357683040.0, + "grad_norm": 1.8611001372716325, + "language_loss": 0.70955557, + "learning_rate": 1.185435699098591e-06, + "loss": 0.73732609, + "num_input_tokens_seen": 115348255, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 2.42382812, + "step": 5360, + "time_per_iteration": 3.1420516967773438 + }, + { + "auxiliary_loss_clip": 0.01503439, + "auxiliary_loss_mlp": 0.01262688, + "balance_loss_clip": 1.14371145, + "balance_loss_mlp": 1.02350664, + "epoch": 0.6446221367161666, + "flos": 14503152384000.0, + "grad_norm": 2.5898692369455927, + "language_loss": 0.78700483, + "learning_rate": 1.1847243242361403e-06, + "loss": 0.81466609, + "num_input_tokens_seen": 115366845, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 2.38671875, + "step": 5361, + "time_per_iteration": 3.1500184535980225 + }, + { + "auxiliary_loss_clip": 0.01499584, + "auxiliary_loss_mlp": 0.01272562, + "balance_loss_clip": 1.13829374, + "balance_loss_mlp": 1.03509748, + "epoch": 0.6447423796068057, + "flos": 24611752719360.0, + "grad_norm": 1.9279812554725342, + "language_loss": 0.77932924, + "learning_rate": 1.1840130730503624e-06, + "loss": 0.8070507, + "num_input_tokens_seen": 115388125, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 2.36914062, + "step": 5362, + "time_per_iteration": 2.9723362922668457 + }, + { + "auxiliary_loss_clip": 0.01503931, + "auxiliary_loss_mlp": 0.01279338, + "balance_loss_clip": 1.14330626, + "balance_loss_mlp": 1.03920245, + "epoch": 0.6448626224974449, + "flos": 25049713554720.0, + "grad_norm": 1.803786643934931, + "language_loss": 0.74661124, + "learning_rate": 1.1833019456491518e-06, + "loss": 0.77444386, + "num_input_tokens_seen": 115409655, + "router_z_loss_clip": 3.60351562, + "router_z_loss_mlp": 2.39648438, + "step": 5363, + "time_per_iteration": 3.026668071746826 + }, + { + "auxiliary_loss_clip": 0.01500875, + "auxiliary_loss_mlp": 0.01267943, + "balance_loss_clip": 1.141469, + "balance_loss_mlp": 1.0266633, + "epoch": 0.6449828653880839, + "flos": 22532661934560.0, + "grad_norm": 2.4744770370116864, + "language_loss": 0.79121351, + "learning_rate": 1.1825909421403871e-06, + "loss": 0.81890166, + "num_input_tokens_seen": 115428750, + "router_z_loss_clip": 3.59570312, + "router_z_loss_mlp": 2.40820312, + "step": 5364, + "time_per_iteration": 4.779074430465698 + }, + { + "auxiliary_loss_clip": 0.01498615, + "auxiliary_loss_mlp": 0.01269632, + "balance_loss_clip": 1.13963819, + "balance_loss_mlp": 1.03102303, + "epoch": 0.645103108278723, + "flos": 25697722726080.0, + "grad_norm": 1.7870027232794925, + "language_loss": 0.763762, + "learning_rate": 1.181880062631926e-06, + "loss": 0.79144442, + "num_input_tokens_seen": 115448085, + "router_z_loss_clip": 3.58789062, + "router_z_loss_mlp": 2.3828125, + "step": 5365, + "time_per_iteration": 2.990831136703491 + }, + { + "auxiliary_loss_clip": 0.01505262, + "auxiliary_loss_mlp": 0.01267716, + "balance_loss_clip": 1.14416182, + "balance_loss_mlp": 1.02719915, + "epoch": 0.6452233511693621, + "flos": 27452600320320.0, + "grad_norm": 2.4073289969052207, + "language_loss": 0.85279322, + "learning_rate": 1.1811693072316093e-06, + "loss": 0.88052297, + "num_input_tokens_seen": 115465765, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 2.40039062, + "step": 5366, + "time_per_iteration": 2.9956443309783936 + }, + { + "auxiliary_loss_clip": 0.01504042, + "auxiliary_loss_mlp": 0.01267393, + "balance_loss_clip": 1.14380741, + "balance_loss_mlp": 1.03088236, + "epoch": 0.6453435940600012, + "flos": 19210652959680.0, + "grad_norm": 3.493906095927575, + "language_loss": 0.84474164, + "learning_rate": 1.1804586760472574e-06, + "loss": 0.87245601, + "num_input_tokens_seen": 115482230, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 2.359375, + "step": 5367, + "time_per_iteration": 2.954463243484497 + }, + { + "auxiliary_loss_clip": 0.01504987, + "auxiliary_loss_mlp": 0.0126408, + "balance_loss_clip": 1.14529848, + "balance_loss_mlp": 1.02699709, + "epoch": 0.6454638369506402, + "flos": 25739975060640.0, + "grad_norm": 2.778908494979248, + "language_loss": 0.8037858, + "learning_rate": 1.1797481691866736e-06, + "loss": 0.83147651, + "num_input_tokens_seen": 115499455, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 2.36523438, + "step": 5368, + "time_per_iteration": 2.9654150009155273 + }, + { + "auxiliary_loss_clip": 0.01505643, + "auxiliary_loss_mlp": 0.0126868, + "balance_loss_clip": 1.14611328, + "balance_loss_mlp": 1.03102422, + "epoch": 0.6455840798412794, + "flos": 20990904857280.0, + "grad_norm": 2.087252312459185, + "language_loss": 0.82701492, + "learning_rate": 1.1790377867576393e-06, + "loss": 0.85475814, + "num_input_tokens_seen": 115517205, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 2.37109375, + "step": 5369, + "time_per_iteration": 3.7608115673065186 + }, + { + "auxiliary_loss_clip": 0.01503896, + "auxiliary_loss_mlp": 0.01263778, + "balance_loss_clip": 1.14414907, + "balance_loss_mlp": 1.02440524, + "epoch": 0.6457043227319185, + "flos": 26069953331520.0, + "grad_norm": 1.900817313812625, + "language_loss": 0.76190031, + "learning_rate": 1.1783275288679203e-06, + "loss": 0.78957701, + "num_input_tokens_seen": 115534370, + "router_z_loss_clip": 3.59570312, + "router_z_loss_mlp": 2.390625, + "step": 5370, + "time_per_iteration": 3.0030298233032227 + }, + { + "auxiliary_loss_clip": 0.01462675, + "auxiliary_loss_mlp": 0.01207352, + "balance_loss_clip": 1.11067688, + "balance_loss_mlp": 1.01242065, + "epoch": 0.6458245656225575, + "flos": 60377567777280.0, + "grad_norm": 0.8538563252191957, + "language_loss": 0.57122087, + "learning_rate": 1.177617395625262e-06, + "loss": 0.59792113, + "num_input_tokens_seen": 115592345, + "router_z_loss_clip": 3.515625, + "router_z_loss_mlp": 1.94921875, + "step": 5371, + "time_per_iteration": 3.3782756328582764 + }, + { + "auxiliary_loss_clip": 0.01501264, + "auxiliary_loss_mlp": 0.01265778, + "balance_loss_clip": 1.1411705, + "balance_loss_mlp": 1.02735949, + "epoch": 0.6459448085131967, + "flos": 23079060472320.0, + "grad_norm": 2.3982338237305125, + "language_loss": 0.75438011, + "learning_rate": 1.1769073871373908e-06, + "loss": 0.78205061, + "num_input_tokens_seen": 115612550, + "router_z_loss_clip": 3.60351562, + "router_z_loss_mlp": 2.37890625, + "step": 5372, + "time_per_iteration": 3.0433647632598877 + }, + { + "auxiliary_loss_clip": 0.01498727, + "auxiliary_loss_mlp": 0.01264918, + "balance_loss_clip": 1.13871837, + "balance_loss_mlp": 1.02440143, + "epoch": 0.6460650514038357, + "flos": 22600212716160.0, + "grad_norm": 2.015587610781665, + "language_loss": 0.83828354, + "learning_rate": 1.176197503512015e-06, + "loss": 0.86591995, + "num_input_tokens_seen": 115632265, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 2.40039062, + "step": 5373, + "time_per_iteration": 3.0018815994262695 + }, + { + "auxiliary_loss_clip": 0.01513916, + "auxiliary_loss_mlp": 0.01268774, + "balance_loss_clip": 1.15440011, + "balance_loss_mlp": 1.03111887, + "epoch": 0.6461852942944748, + "flos": 20268821260800.0, + "grad_norm": 2.4868423221185267, + "language_loss": 0.82743347, + "learning_rate": 1.1754877448568223e-06, + "loss": 0.85526037, + "num_input_tokens_seen": 115651720, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 2.37109375, + "step": 5374, + "time_per_iteration": 3.0759775638580322 + }, + { + "auxiliary_loss_clip": 0.01507221, + "auxiliary_loss_mlp": 0.01268511, + "balance_loss_clip": 1.1482892, + "balance_loss_mlp": 1.03104627, + "epoch": 0.646305537185114, + "flos": 23369327595360.0, + "grad_norm": 2.0980515231069985, + "language_loss": 0.90070474, + "learning_rate": 1.1747781112794837e-06, + "loss": 0.92846209, + "num_input_tokens_seen": 115668215, + "router_z_loss_clip": 3.5859375, + "router_z_loss_mlp": 2.36914062, + "step": 5375, + "time_per_iteration": 3.920863389968872 + }, + { + "auxiliary_loss_clip": 0.01511441, + "auxiliary_loss_mlp": 0.01260958, + "balance_loss_clip": 1.15273952, + "balance_loss_mlp": 1.0217762, + "epoch": 0.646425780075753, + "flos": 24279650471520.0, + "grad_norm": 1.6861791136661417, + "language_loss": 0.83206177, + "learning_rate": 1.1740686028876487e-06, + "loss": 0.8597858, + "num_input_tokens_seen": 115687080, + "router_z_loss_clip": 3.58789062, + "router_z_loss_mlp": 2.38671875, + "step": 5376, + "time_per_iteration": 3.0782604217529297 + }, + { + "auxiliary_loss_clip": 0.0150571, + "auxiliary_loss_mlp": 0.01264568, + "balance_loss_clip": 1.14591825, + "balance_loss_mlp": 1.0253861, + "epoch": 0.6465460229663921, + "flos": 20816092146240.0, + "grad_norm": 3.341143355582927, + "language_loss": 0.75475019, + "learning_rate": 1.1733592197889507e-06, + "loss": 0.78245294, + "num_input_tokens_seen": 115703990, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 2.38671875, + "step": 5377, + "time_per_iteration": 3.0358307361602783 + }, + { + "auxiliary_loss_clip": 0.01501309, + "auxiliary_loss_mlp": 0.01265892, + "balance_loss_clip": 1.14144838, + "balance_loss_mlp": 1.02918971, + "epoch": 0.6466662658570312, + "flos": 22855320070560.0, + "grad_norm": 2.3019371684405754, + "language_loss": 0.7251395, + "learning_rate": 1.1726499620910014e-06, + "loss": 0.75281155, + "num_input_tokens_seen": 115724270, + "router_z_loss_clip": 3.6015625, + "router_z_loss_mlp": 2.36132812, + "step": 5378, + "time_per_iteration": 3.061565637588501 + }, + { + "auxiliary_loss_clip": 0.01499111, + "auxiliary_loss_mlp": 0.0127004, + "balance_loss_clip": 1.13909209, + "balance_loss_mlp": 1.02952385, + "epoch": 0.6467865087476703, + "flos": 15306365043360.0, + "grad_norm": 2.3179221837780455, + "language_loss": 0.77831769, + "learning_rate": 1.1719408299013955e-06, + "loss": 0.80600923, + "num_input_tokens_seen": 115742995, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 2.40039062, + "step": 5379, + "time_per_iteration": 3.0288772583007812 + }, + { + "auxiliary_loss_clip": 0.01506562, + "auxiliary_loss_mlp": 0.01262289, + "balance_loss_clip": 1.14663696, + "balance_loss_mlp": 1.02768481, + "epoch": 0.6469067516383094, + "flos": 19575715142880.0, + "grad_norm": 3.2906298602018786, + "language_loss": 0.75710773, + "learning_rate": 1.1712318233277067e-06, + "loss": 0.78479624, + "num_input_tokens_seen": 115762015, + "router_z_loss_clip": 3.59570312, + "router_z_loss_mlp": 2.34179688, + "step": 5380, + "time_per_iteration": 3.0347061157226562 + }, + { + "auxiliary_loss_clip": 0.0146154, + "auxiliary_loss_mlp": 0.01192894, + "balance_loss_clip": 1.10936534, + "balance_loss_mlp": 0.99681854, + "epoch": 0.6470269945289485, + "flos": 65104108653600.0, + "grad_norm": 0.7453457747868218, + "language_loss": 0.57794446, + "learning_rate": 1.1705229424774916e-06, + "loss": 0.60448879, + "num_input_tokens_seen": 115816285, + "router_z_loss_clip": 3.515625, + "router_z_loss_mlp": 1.9609375, + "step": 5381, + "time_per_iteration": 3.375495195388794 + }, + { + "auxiliary_loss_clip": 0.01496982, + "auxiliary_loss_mlp": 0.01273719, + "balance_loss_clip": 1.13776779, + "balance_loss_mlp": 1.03205836, + "epoch": 0.6471472374195876, + "flos": 30699359097120.0, + "grad_norm": 2.032365807540882, + "language_loss": 0.64405018, + "learning_rate": 1.1698141874582867e-06, + "loss": 0.67175722, + "num_input_tokens_seen": 115837330, + "router_z_loss_clip": 3.59570312, + "router_z_loss_mlp": 2.41210938, + "step": 5382, + "time_per_iteration": 3.1702158451080322 + }, + { + "auxiliary_loss_clip": 0.01503102, + "auxiliary_loss_mlp": 0.01259185, + "balance_loss_clip": 1.14405465, + "balance_loss_mlp": 1.02305484, + "epoch": 0.6472674803102266, + "flos": 20523928615200.0, + "grad_norm": 2.3262938219261966, + "language_loss": 0.72264326, + "learning_rate": 1.169105558377609e-06, + "loss": 0.75026613, + "num_input_tokens_seen": 115857420, + "router_z_loss_clip": 3.59375, + "router_z_loss_mlp": 2.35546875, + "step": 5383, + "time_per_iteration": 3.110062599182129 + }, + { + "auxiliary_loss_clip": 0.01505225, + "auxiliary_loss_mlp": 0.01270191, + "balance_loss_clip": 1.1459105, + "balance_loss_mlp": 1.03406143, + "epoch": 0.6473877232008658, + "flos": 24717649235040.0, + "grad_norm": 2.226724818757171, + "language_loss": 0.78687143, + "learning_rate": 1.1683970553429587e-06, + "loss": 0.81462562, + "num_input_tokens_seen": 115878875, + "router_z_loss_clip": 3.59570312, + "router_z_loss_mlp": 2.35742188, + "step": 5384, + "time_per_iteration": 3.1450347900390625 + }, + { + "auxiliary_loss_clip": 0.01504119, + "auxiliary_loss_mlp": 0.01273547, + "balance_loss_clip": 1.1439054, + "balance_loss_mlp": 1.03474653, + "epoch": 0.6475079660915048, + "flos": 15887240642880.0, + "grad_norm": 2.440637615813095, + "language_loss": 0.82058299, + "learning_rate": 1.1676886784618128e-06, + "loss": 0.84835964, + "num_input_tokens_seen": 115895540, + "router_z_loss_clip": 3.6015625, + "router_z_loss_mlp": 2.3828125, + "step": 5385, + "time_per_iteration": 2.987886428833008 + }, + { + "auxiliary_loss_clip": 0.01500611, + "auxiliary_loss_mlp": 0.01266797, + "balance_loss_clip": 1.14123809, + "balance_loss_mlp": 1.02818727, + "epoch": 0.6476282089821439, + "flos": 17383369779360.0, + "grad_norm": 2.7954738502437624, + "language_loss": 0.83670199, + "learning_rate": 1.1669804278416332e-06, + "loss": 0.86437607, + "num_input_tokens_seen": 115910265, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 2.38085938, + "step": 5386, + "time_per_iteration": 3.030710458755493 + }, + { + "auxiliary_loss_clip": 0.01503845, + "auxiliary_loss_mlp": 0.01268258, + "balance_loss_clip": 1.14400208, + "balance_loss_mlp": 1.02907646, + "epoch": 0.6477484518727831, + "flos": 20196188105760.0, + "grad_norm": 2.252315486796655, + "language_loss": 0.71400428, + "learning_rate": 1.1662723035898602e-06, + "loss": 0.74172533, + "num_input_tokens_seen": 115930025, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 2.38671875, + "step": 5387, + "time_per_iteration": 3.0405633449554443 + }, + { + "auxiliary_loss_clip": 0.01503582, + "auxiliary_loss_mlp": 0.0127635, + "balance_loss_clip": 1.14346552, + "balance_loss_mlp": 1.03602374, + "epoch": 0.6478686947634221, + "flos": 25412765545440.0, + "grad_norm": 2.1049145748422013, + "language_loss": 0.82015908, + "learning_rate": 1.165564305813915e-06, + "loss": 0.84795833, + "num_input_tokens_seen": 115949025, + "router_z_loss_clip": 3.60351562, + "router_z_loss_mlp": 2.3984375, + "step": 5388, + "time_per_iteration": 3.088804006576538 + }, + { + "auxiliary_loss_clip": 0.01498763, + "auxiliary_loss_mlp": 0.01261774, + "balance_loss_clip": 1.14083982, + "balance_loss_mlp": 1.0237366, + "epoch": 0.6479889376540612, + "flos": 20085626426400.0, + "grad_norm": 1.8322083866413275, + "language_loss": 0.81612611, + "learning_rate": 1.1648564346212019e-06, + "loss": 0.84373152, + "num_input_tokens_seen": 115968145, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.375, + "step": 5389, + "time_per_iteration": 3.0674164295196533 + }, + { + "auxiliary_loss_clip": 0.01498183, + "auxiliary_loss_mlp": 0.01266021, + "balance_loss_clip": 1.13894641, + "balance_loss_mlp": 1.0289371, + "epoch": 0.6481091805447003, + "flos": 26760366550080.0, + "grad_norm": 1.9768807687658332, + "language_loss": 0.76392281, + "learning_rate": 1.164148690119104e-06, + "loss": 0.79156482, + "num_input_tokens_seen": 115989425, + "router_z_loss_clip": 3.59375, + "router_z_loss_mlp": 2.36523438, + "step": 5390, + "time_per_iteration": 3.877061367034912 + }, + { + "auxiliary_loss_clip": 0.0149629, + "auxiliary_loss_mlp": 0.01268473, + "balance_loss_clip": 1.1369971, + "balance_loss_mlp": 1.02910113, + "epoch": 0.6482294234353394, + "flos": 23954413220640.0, + "grad_norm": 2.0796583000422784, + "language_loss": 0.74337912, + "learning_rate": 1.163441072414985e-06, + "loss": 0.77102673, + "num_input_tokens_seen": 116009630, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 2.38867188, + "step": 5391, + "time_per_iteration": 3.9226887226104736 + }, + { + "auxiliary_loss_clip": 0.0149637, + "auxiliary_loss_mlp": 0.01271705, + "balance_loss_clip": 1.13693726, + "balance_loss_mlp": 1.03309596, + "epoch": 0.6483496663259785, + "flos": 26212033676160.0, + "grad_norm": 3.1384396663235323, + "language_loss": 0.69916624, + "learning_rate": 1.16273358161619e-06, + "loss": 0.72684699, + "num_input_tokens_seen": 116029965, + "router_z_loss_clip": 3.59375, + "router_z_loss_mlp": 2.38085938, + "step": 5392, + "time_per_iteration": 3.138683795928955 + }, + { + "auxiliary_loss_clip": 0.01502716, + "auxiliary_loss_mlp": 0.01276365, + "balance_loss_clip": 1.14244986, + "balance_loss_mlp": 1.03508568, + "epoch": 0.6484699092166175, + "flos": 20924302279680.0, + "grad_norm": 2.020649760285921, + "language_loss": 0.83456135, + "learning_rate": 1.1620262178300446e-06, + "loss": 0.86235219, + "num_input_tokens_seen": 116048580, + "router_z_loss_clip": 3.60351562, + "router_z_loss_mlp": 2.40820312, + "step": 5393, + "time_per_iteration": 3.138218402862549 + }, + { + "auxiliary_loss_clip": 0.01501044, + "auxiliary_loss_mlp": 0.01264827, + "balance_loss_clip": 1.14194143, + "balance_loss_mlp": 1.02793384, + "epoch": 0.6485901521072567, + "flos": 33075657858240.0, + "grad_norm": 4.121709827961587, + "language_loss": 0.75785685, + "learning_rate": 1.1613189811638563e-06, + "loss": 0.78551555, + "num_input_tokens_seen": 116070305, + "router_z_loss_clip": 3.59179688, + "router_z_loss_mlp": 2.36328125, + "step": 5394, + "time_per_iteration": 3.1910221576690674 + }, + { + "auxiliary_loss_clip": 0.01499345, + "auxiliary_loss_mlp": 0.01268176, + "balance_loss_clip": 1.13854873, + "balance_loss_mlp": 1.03319049, + "epoch": 0.6487103949978957, + "flos": 22276302950880.0, + "grad_norm": 1.7988511341903068, + "language_loss": 0.78172982, + "learning_rate": 1.1606118717249117e-06, + "loss": 0.80940509, + "num_input_tokens_seen": 116090405, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 2.34375, + "step": 5395, + "time_per_iteration": 3.1111278533935547 + }, + { + "auxiliary_loss_clip": 0.01490345, + "auxiliary_loss_mlp": 0.01262941, + "balance_loss_clip": 1.13076413, + "balance_loss_mlp": 1.02051663, + "epoch": 0.6488306378885348, + "flos": 22932656317440.0, + "grad_norm": 2.317386419408094, + "language_loss": 0.67906725, + "learning_rate": 1.1599048896204787e-06, + "loss": 0.70660013, + "num_input_tokens_seen": 116110285, + "router_z_loss_clip": 3.59570312, + "router_z_loss_mlp": 2.41992188, + "step": 5396, + "time_per_iteration": 3.885094404220581 + }, + { + "auxiliary_loss_clip": 0.01491438, + "auxiliary_loss_mlp": 0.01267139, + "balance_loss_clip": 1.13197839, + "balance_loss_mlp": 1.03119993, + "epoch": 0.648950880779174, + "flos": 20378207167200.0, + "grad_norm": 2.3758962378957005, + "language_loss": 0.80988652, + "learning_rate": 1.1591980349578061e-06, + "loss": 0.83747232, + "num_input_tokens_seen": 116128955, + "router_z_loss_clip": 3.59375, + "router_z_loss_mlp": 2.35351562, + "step": 5397, + "time_per_iteration": 3.0354278087615967 + }, + { + "auxiliary_loss_clip": 0.01463186, + "auxiliary_loss_mlp": 0.0121357, + "balance_loss_clip": 1.11104763, + "balance_loss_mlp": 1.01787567, + "epoch": 0.649071123669813, + "flos": 59937482964960.0, + "grad_norm": 0.7645170772027846, + "language_loss": 0.54234147, + "learning_rate": 1.158491307844123e-06, + "loss": 0.56910902, + "num_input_tokens_seen": 116188875, + "router_z_loss_clip": 3.515625, + "router_z_loss_mlp": 1.953125, + "step": 5398, + "time_per_iteration": 3.442112445831299 + }, + { + "auxiliary_loss_clip": 0.01502526, + "auxiliary_loss_mlp": 0.01267675, + "balance_loss_clip": 1.14389992, + "balance_loss_mlp": 1.0281117, + "epoch": 0.6491913665604521, + "flos": 20448526704480.0, + "grad_norm": 2.0414655662598538, + "language_loss": 0.83803761, + "learning_rate": 1.1577847083866387e-06, + "loss": 0.86573958, + "num_input_tokens_seen": 116207910, + "router_z_loss_clip": 3.5859375, + "router_z_loss_mlp": 2.390625, + "step": 5399, + "time_per_iteration": 3.0126378536224365 + }, + { + "auxiliary_loss_clip": 0.01494074, + "auxiliary_loss_mlp": 0.0127164, + "balance_loss_clip": 1.13208413, + "balance_loss_mlp": 1.03379405, + "epoch": 0.6493116094510912, + "flos": 16948746622080.0, + "grad_norm": 5.513435768369209, + "language_loss": 0.71978593, + "learning_rate": 1.1570782366925453e-06, + "loss": 0.74744308, + "num_input_tokens_seen": 116226425, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 2.375, + "step": 5400, + "time_per_iteration": 3.0358476638793945 + }, + { + "auxiliary_loss_clip": 0.01496322, + "auxiliary_loss_mlp": 0.01267856, + "balance_loss_clip": 1.13636458, + "balance_loss_mlp": 1.0294373, + "epoch": 0.6494318523417303, + "flos": 18804703855680.0, + "grad_norm": 2.180584627821655, + "language_loss": 0.75295866, + "learning_rate": 1.1563718928690132e-06, + "loss": 0.78060043, + "num_input_tokens_seen": 116243860, + "router_z_loss_clip": 3.6015625, + "router_z_loss_mlp": 2.37890625, + "step": 5401, + "time_per_iteration": 3.123119354248047 + }, + { + "auxiliary_loss_clip": 0.01494998, + "auxiliary_loss_mlp": 0.01279612, + "balance_loss_clip": 1.1336689, + "balance_loss_mlp": 1.04252851, + "epoch": 0.6495520952323693, + "flos": 18984485155680.0, + "grad_norm": 2.4933424741459826, + "language_loss": 0.71562237, + "learning_rate": 1.1556656770231942e-06, + "loss": 0.74336851, + "num_input_tokens_seen": 116260055, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 2.36523438, + "step": 5402, + "time_per_iteration": 3.028393507003784 + }, + { + "auxiliary_loss_clip": 0.01494633, + "auxiliary_loss_mlp": 0.01264094, + "balance_loss_clip": 1.13530815, + "balance_loss_mlp": 1.02682006, + "epoch": 0.6496723381230085, + "flos": 22747489218720.0, + "grad_norm": 1.6168998933843715, + "language_loss": 0.76229954, + "learning_rate": 1.1549595892622207e-06, + "loss": 0.78988683, + "num_input_tokens_seen": 116278825, + "router_z_loss_clip": 3.59570312, + "router_z_loss_mlp": 2.3671875, + "step": 5403, + "time_per_iteration": 3.932373046875 + }, + { + "auxiliary_loss_clip": 0.01459422, + "auxiliary_loss_mlp": 0.01213425, + "balance_loss_clip": 1.10767138, + "balance_loss_mlp": 1.0165863, + "epoch": 0.6497925810136476, + "flos": 62151371887680.0, + "grad_norm": 0.8281197397056456, + "language_loss": 0.58934891, + "learning_rate": 1.1542536296932047e-06, + "loss": 0.6160773, + "num_input_tokens_seen": 116342360, + "router_z_loss_clip": 3.515625, + "router_z_loss_mlp": 1.96484375, + "step": 5404, + "time_per_iteration": 3.4680755138397217 + }, + { + "auxiliary_loss_clip": 0.01496308, + "auxiliary_loss_mlp": 0.01269798, + "balance_loss_clip": 1.13923812, + "balance_loss_mlp": 1.02794647, + "epoch": 0.6499128239042866, + "flos": 20158790575680.0, + "grad_norm": 2.4366855478603884, + "language_loss": 0.69966912, + "learning_rate": 1.1535477984232414e-06, + "loss": 0.72733021, + "num_input_tokens_seen": 116362235, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 2.4140625, + "step": 5405, + "time_per_iteration": 3.1259100437164307 + }, + { + "auxiliary_loss_clip": 0.01492372, + "auxiliary_loss_mlp": 0.01261028, + "balance_loss_clip": 1.13298535, + "balance_loss_mlp": 1.02356339, + "epoch": 0.6500330667949258, + "flos": 24464741713920.0, + "grad_norm": 2.817068182269587, + "language_loss": 0.77333903, + "learning_rate": 1.152842095559404e-06, + "loss": 0.80087304, + "num_input_tokens_seen": 116382895, + "router_z_loss_clip": 3.59179688, + "router_z_loss_mlp": 2.36914062, + "step": 5406, + "time_per_iteration": 3.0896027088165283 + }, + { + "auxiliary_loss_clip": 0.01494269, + "auxiliary_loss_mlp": 0.01256465, + "balance_loss_clip": 1.13616037, + "balance_loss_mlp": 1.01995349, + "epoch": 0.6501533096855648, + "flos": 25479406051200.0, + "grad_norm": 1.8502125425798468, + "language_loss": 0.77097541, + "learning_rate": 1.1521365212087474e-06, + "loss": 0.79848278, + "num_input_tokens_seen": 116402880, + "router_z_loss_clip": 3.5859375, + "router_z_loss_mlp": 2.359375, + "step": 5407, + "time_per_iteration": 3.0120794773101807 + }, + { + "auxiliary_loss_clip": 0.01492667, + "auxiliary_loss_mlp": 0.01262991, + "balance_loss_clip": 1.13335609, + "balance_loss_mlp": 1.02171135, + "epoch": 0.6502735525762039, + "flos": 44822581459200.0, + "grad_norm": 1.8006567386307009, + "language_loss": 0.70815337, + "learning_rate": 1.1514310754783062e-06, + "loss": 0.73570997, + "num_input_tokens_seen": 116425830, + "router_z_loss_clip": 3.58984375, + "router_z_loss_mlp": 2.40820312, + "step": 5408, + "time_per_iteration": 3.133131980895996 + }, + { + "auxiliary_loss_clip": 0.01492235, + "auxiliary_loss_mlp": 0.0126888, + "balance_loss_clip": 1.1328516, + "balance_loss_mlp": 1.03179646, + "epoch": 0.6503937954668431, + "flos": 28661117304960.0, + "grad_norm": 2.4876394511609674, + "language_loss": 0.73733807, + "learning_rate": 1.1507257584750964e-06, + "loss": 0.7649492, + "num_input_tokens_seen": 116446010, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 2.36523438, + "step": 5409, + "time_per_iteration": 3.0145602226257324 + }, + { + "auxiliary_loss_clip": 0.01496248, + "auxiliary_loss_mlp": 0.01285212, + "balance_loss_clip": 1.13718796, + "balance_loss_mlp": 1.04221594, + "epoch": 0.6505140383574821, + "flos": 20924643633120.0, + "grad_norm": 1.9416141771933924, + "language_loss": 0.77663255, + "learning_rate": 1.150020570306113e-06, + "loss": 0.80444711, + "num_input_tokens_seen": 116465150, + "router_z_loss_clip": 3.59179688, + "router_z_loss_mlp": 2.42578125, + "step": 5410, + "time_per_iteration": 2.9080657958984375 + }, + { + "auxiliary_loss_clip": 0.01492571, + "auxiliary_loss_mlp": 0.01260748, + "balance_loss_clip": 1.13315237, + "balance_loss_mlp": 1.01927722, + "epoch": 0.6506342812481212, + "flos": 20597358261600.0, + "grad_norm": 1.9620355834097003, + "language_loss": 0.75000882, + "learning_rate": 1.1493155110783338e-06, + "loss": 0.777542, + "num_input_tokens_seen": 116483675, + "router_z_loss_clip": 3.59375, + "router_z_loss_mlp": 2.41015625, + "step": 5411, + "time_per_iteration": 2.9727368354797363 + }, + { + "auxiliary_loss_clip": 0.01498562, + "auxiliary_loss_mlp": 0.01271835, + "balance_loss_clip": 1.13984406, + "balance_loss_mlp": 1.0339886, + "epoch": 0.6507545241387603, + "flos": 30229083105120.0, + "grad_norm": 3.0829677443126906, + "language_loss": 0.70428979, + "learning_rate": 1.1486105808987155e-06, + "loss": 0.73199373, + "num_input_tokens_seen": 116505165, + "router_z_loss_clip": 3.5859375, + "router_z_loss_mlp": 2.37304688, + "step": 5412, + "time_per_iteration": 3.1462900638580322 + }, + { + "auxiliary_loss_clip": 0.01491264, + "auxiliary_loss_mlp": 0.01273937, + "balance_loss_clip": 1.1304574, + "balance_loss_mlp": 1.03551829, + "epoch": 0.6508747670293994, + "flos": 17130120904800.0, + "grad_norm": 2.044942547960559, + "language_loss": 0.81958348, + "learning_rate": 1.1479057798741947e-06, + "loss": 0.84723544, + "num_input_tokens_seen": 116523220, + "router_z_loss_clip": 3.609375, + "router_z_loss_mlp": 2.37890625, + "step": 5413, + "time_per_iteration": 2.980292797088623 + }, + { + "auxiliary_loss_clip": 0.01452226, + "auxiliary_loss_mlp": 0.01205742, + "balance_loss_clip": 1.10320258, + "balance_loss_mlp": 1.00814056, + "epoch": 0.6509950099200384, + "flos": 68565884355360.0, + "grad_norm": 0.7873054502079327, + "language_loss": 0.53299695, + "learning_rate": 1.14720110811169e-06, + "loss": 0.55957663, + "num_input_tokens_seen": 116580450, + "router_z_loss_clip": 3.4921875, + "router_z_loss_mlp": 1.96875, + "step": 5414, + "time_per_iteration": 3.441032648086548 + }, + { + "auxiliary_loss_clip": 0.01498198, + "auxiliary_loss_mlp": 0.01274513, + "balance_loss_clip": 1.14096928, + "balance_loss_mlp": 1.03514099, + "epoch": 0.6511152528106776, + "flos": 22349732597280.0, + "grad_norm": 3.4982182098889463, + "language_loss": 0.7687692, + "learning_rate": 1.146496565718098e-06, + "loss": 0.79649633, + "num_input_tokens_seen": 116601020, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.38867188, + "step": 5415, + "time_per_iteration": 3.0090863704681396 + }, + { + "auxiliary_loss_clip": 0.01491917, + "auxiliary_loss_mlp": 0.01267886, + "balance_loss_clip": 1.13337564, + "balance_loss_mlp": 1.02679718, + "epoch": 0.6512354957013167, + "flos": 20524269968640.0, + "grad_norm": 7.847581398882013, + "language_loss": 0.75825542, + "learning_rate": 1.1457921528002996e-06, + "loss": 0.78585339, + "num_input_tokens_seen": 116619455, + "router_z_loss_clip": 3.5859375, + "router_z_loss_mlp": 2.40625, + "step": 5416, + "time_per_iteration": 3.021420478820801 + }, + { + "auxiliary_loss_clip": 0.01490425, + "auxiliary_loss_mlp": 0.01267955, + "balance_loss_clip": 1.1327039, + "balance_loss_mlp": 1.03049016, + "epoch": 0.6513557385919557, + "flos": 32339768411520.0, + "grad_norm": 2.295432707768672, + "language_loss": 0.72422278, + "learning_rate": 1.1450878694651522e-06, + "loss": 0.75180656, + "num_input_tokens_seen": 116640020, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 2.36914062, + "step": 5417, + "time_per_iteration": 3.9616806507110596 + }, + { + "auxiliary_loss_clip": 0.01489172, + "auxiliary_loss_mlp": 0.01272257, + "balance_loss_clip": 1.13165414, + "balance_loss_mlp": 1.03460157, + "epoch": 0.6514759814825949, + "flos": 12094803963360.0, + "grad_norm": 2.6177123976887624, + "language_loss": 0.63655758, + "learning_rate": 1.1443837158194954e-06, + "loss": 0.66417193, + "num_input_tokens_seen": 116655165, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.37109375, + "step": 5418, + "time_per_iteration": 3.0196115970611572 + }, + { + "auxiliary_loss_clip": 0.01487962, + "auxiliary_loss_mlp": 0.01274298, + "balance_loss_clip": 1.13016605, + "balance_loss_mlp": 1.0334003, + "epoch": 0.651596224373234, + "flos": 22529020831200.0, + "grad_norm": 1.671394862329695, + "language_loss": 0.74650657, + "learning_rate": 1.1436796919701484e-06, + "loss": 0.77412915, + "num_input_tokens_seen": 116673880, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 2.40429688, + "step": 5419, + "time_per_iteration": 3.892137050628662 + }, + { + "auxiliary_loss_clip": 0.01485615, + "auxiliary_loss_mlp": 0.01266752, + "balance_loss_clip": 1.12695718, + "balance_loss_mlp": 1.02699816, + "epoch": 0.651716467263873, + "flos": 27821455319520.0, + "grad_norm": 3.10573124979513, + "language_loss": 0.61941707, + "learning_rate": 1.1429757980239115e-06, + "loss": 0.64694071, + "num_input_tokens_seen": 116694305, + "router_z_loss_clip": 3.58789062, + "router_z_loss_mlp": 2.39257812, + "step": 5420, + "time_per_iteration": 3.0455832481384277 + }, + { + "auxiliary_loss_clip": 0.0148762, + "auxiliary_loss_mlp": 0.01277783, + "balance_loss_clip": 1.12929988, + "balance_loss_mlp": 1.03535926, + "epoch": 0.6518367101545122, + "flos": 24318603056160.0, + "grad_norm": 2.430605967502616, + "language_loss": 0.81926966, + "learning_rate": 1.1422720340875636e-06, + "loss": 0.84692359, + "num_input_tokens_seen": 116713055, + "router_z_loss_clip": 3.58203125, + "router_z_loss_mlp": 2.41992188, + "step": 5421, + "time_per_iteration": 3.05352783203125 + }, + { + "auxiliary_loss_clip": 0.01491756, + "auxiliary_loss_mlp": 0.01278695, + "balance_loss_clip": 1.13322783, + "balance_loss_mlp": 1.04008603, + "epoch": 0.6519569530451512, + "flos": 20014055259840.0, + "grad_norm": 2.3283304854898232, + "language_loss": 0.79533762, + "learning_rate": 1.1415684002678671e-06, + "loss": 0.82304215, + "num_input_tokens_seen": 116731815, + "router_z_loss_clip": 3.58398438, + "router_z_loss_mlp": 2.38085938, + "step": 5422, + "time_per_iteration": 3.135289192199707 + }, + { + "auxiliary_loss_clip": 0.01493348, + "auxiliary_loss_mlp": 0.01276188, + "balance_loss_clip": 1.13599753, + "balance_loss_mlp": 1.03700686, + "epoch": 0.6520771959357903, + "flos": 21578000675040.0, + "grad_norm": 2.5319333739890983, + "language_loss": 0.77639091, + "learning_rate": 1.1408648966715617e-06, + "loss": 0.80408627, + "num_input_tokens_seen": 116749335, + "router_z_loss_clip": 3.57617188, + "router_z_loss_mlp": 2.38671875, + "step": 5423, + "time_per_iteration": 3.913978099822998 + }, + { + "auxiliary_loss_clip": 0.01488555, + "auxiliary_loss_mlp": 0.0127114, + "balance_loss_clip": 1.13082683, + "balance_loss_mlp": 1.03043246, + "epoch": 0.6521974388264293, + "flos": 22713239725920.0, + "grad_norm": 2.8589714796807417, + "language_loss": 0.72398937, + "learning_rate": 1.1401615234053683e-06, + "loss": 0.75158632, + "num_input_tokens_seen": 116768155, + "router_z_loss_clip": 3.58007812, + "router_z_loss_mlp": 2.40234375, + "step": 5424, + "time_per_iteration": 3.1607460975646973 + }, + { + "auxiliary_loss_clip": 0.0148431, + "auxiliary_loss_mlp": 0.01278004, + "balance_loss_clip": 1.12573218, + "balance_loss_mlp": 1.03748786, + "epoch": 0.6523176817170685, + "flos": 23005175688000.0, + "grad_norm": 10.465417802963985, + "language_loss": 0.75788498, + "learning_rate": 1.1394582805759885e-06, + "loss": 0.78550816, + "num_input_tokens_seen": 116787435, + "router_z_loss_clip": 3.58789062, + "router_z_loss_mlp": 2.40039062, + "step": 5425, + "time_per_iteration": 3.000171661376953 + }, + { + "auxiliary_loss_clip": 0.01491874, + "auxiliary_loss_mlp": 0.01269141, + "balance_loss_clip": 1.13394928, + "balance_loss_mlp": 1.02728963, + "epoch": 0.6524379246077076, + "flos": 21690344977920.0, + "grad_norm": 1.7283056274076238, + "language_loss": 0.75717187, + "learning_rate": 1.1387551682901022e-06, + "loss": 0.78478205, + "num_input_tokens_seen": 116808040, + "router_z_loss_clip": 3.58203125, + "router_z_loss_mlp": 2.4140625, + "step": 5426, + "time_per_iteration": 3.1101787090301514 + }, + { + "auxiliary_loss_clip": 0.01489582, + "auxiliary_loss_mlp": 0.01261076, + "balance_loss_clip": 1.1321063, + "balance_loss_mlp": 1.02361083, + "epoch": 0.6525581674983466, + "flos": 19392975446400.0, + "grad_norm": 2.5633027084267455, + "language_loss": 0.70663249, + "learning_rate": 1.138052186654373e-06, + "loss": 0.73413903, + "num_input_tokens_seen": 116825510, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.36914062, + "step": 5427, + "time_per_iteration": 2.9512429237365723 + }, + { + "auxiliary_loss_clip": 0.0149317, + "auxiliary_loss_mlp": 0.01273859, + "balance_loss_clip": 1.13527596, + "balance_loss_mlp": 1.03067207, + "epoch": 0.6526784103889858, + "flos": 17167366722240.0, + "grad_norm": 2.197137298007476, + "language_loss": 0.88095027, + "learning_rate": 1.1373493357754417e-06, + "loss": 0.90862054, + "num_input_tokens_seen": 116844415, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 2.42773438, + "step": 5428, + "time_per_iteration": 3.052541732788086 + }, + { + "auxiliary_loss_clip": 0.01494405, + "auxiliary_loss_mlp": 0.0125061, + "balance_loss_clip": 1.13852727, + "balance_loss_mlp": 1.01333547, + "epoch": 0.6527986532796248, + "flos": 18991160511840.0, + "grad_norm": 2.0108089935181868, + "language_loss": 0.77699137, + "learning_rate": 1.1366466157599303e-06, + "loss": 0.80444157, + "num_input_tokens_seen": 116863690, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.36914062, + "step": 5429, + "time_per_iteration": 3.036489963531494 + }, + { + "auxiliary_loss_clip": 0.01496496, + "auxiliary_loss_mlp": 0.01265468, + "balance_loss_clip": 1.13820922, + "balance_loss_mlp": 1.0270493, + "epoch": 0.6529188961702639, + "flos": 14239890475200.0, + "grad_norm": 2.638789977690195, + "language_loss": 0.76383215, + "learning_rate": 1.1359440267144412e-06, + "loss": 0.79145175, + "num_input_tokens_seen": 116881145, + "router_z_loss_clip": 3.58203125, + "router_z_loss_mlp": 2.37890625, + "step": 5430, + "time_per_iteration": 3.706735372543335 + }, + { + "auxiliary_loss_clip": 0.01497236, + "auxiliary_loss_mlp": 0.01261531, + "balance_loss_clip": 1.13984454, + "balance_loss_mlp": 1.02349436, + "epoch": 0.653039139060903, + "flos": 36685809979200.0, + "grad_norm": 1.7853118838597284, + "language_loss": 0.74049079, + "learning_rate": 1.1352415687455556e-06, + "loss": 0.76807851, + "num_input_tokens_seen": 116902405, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.375, + "step": 5431, + "time_per_iteration": 3.1179630756378174 + }, + { + "auxiliary_loss_clip": 0.01500384, + "auxiliary_loss_mlp": 0.01267917, + "balance_loss_clip": 1.14136958, + "balance_loss_mlp": 1.02911639, + "epoch": 0.6531593819515421, + "flos": 25378743621600.0, + "grad_norm": 2.3426997629921695, + "language_loss": 0.64146084, + "learning_rate": 1.1345392419598362e-06, + "loss": 0.66914386, + "num_input_tokens_seen": 116921285, + "router_z_loss_clip": 3.59179688, + "router_z_loss_mlp": 2.3828125, + "step": 5432, + "time_per_iteration": 3.0051257610321045 + }, + { + "auxiliary_loss_clip": 0.0149794, + "auxiliary_loss_mlp": 0.01263419, + "balance_loss_clip": 1.14167428, + "balance_loss_mlp": 1.02442777, + "epoch": 0.6532796248421812, + "flos": 21180661263360.0, + "grad_norm": 1.910727045638089, + "language_loss": 0.72249103, + "learning_rate": 1.1338370464638263e-06, + "loss": 0.75010467, + "num_input_tokens_seen": 116940685, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.38476562, + "step": 5433, + "time_per_iteration": 2.9706199169158936 + }, + { + "auxiliary_loss_clip": 0.01494015, + "auxiliary_loss_mlp": 0.01270599, + "balance_loss_clip": 1.13609171, + "balance_loss_mlp": 1.03084493, + "epoch": 0.6533998677328203, + "flos": 17677884856320.0, + "grad_norm": 2.377101151195593, + "language_loss": 0.63872027, + "learning_rate": 1.1331349823640474e-06, + "loss": 0.66636646, + "num_input_tokens_seen": 116958115, + "router_z_loss_clip": 3.58007812, + "router_z_loss_mlp": 2.39257812, + "step": 5434, + "time_per_iteration": 3.0441627502441406 + }, + { + "auxiliary_loss_clip": 0.01501008, + "auxiliary_loss_mlp": 0.0125466, + "balance_loss_clip": 1.14368749, + "balance_loss_mlp": 1.02081954, + "epoch": 0.6535201106234594, + "flos": 28402786056960.0, + "grad_norm": 2.438137589853645, + "language_loss": 0.77919769, + "learning_rate": 1.132433049767003e-06, + "loss": 0.80675435, + "num_input_tokens_seen": 116976030, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.33398438, + "step": 5435, + "time_per_iteration": 2.9815673828125 + }, + { + "auxiliary_loss_clip": 0.01496885, + "auxiliary_loss_mlp": 0.01268022, + "balance_loss_clip": 1.13860941, + "balance_loss_mlp": 1.03036654, + "epoch": 0.6536403535140984, + "flos": 23588933827680.0, + "grad_norm": 1.7181233985828828, + "language_loss": 0.81009859, + "learning_rate": 1.1317312487791748e-06, + "loss": 0.83774769, + "num_input_tokens_seen": 116997680, + "router_z_loss_clip": 3.58203125, + "router_z_loss_mlp": 2.37109375, + "step": 5436, + "time_per_iteration": 3.075784683227539 + }, + { + "auxiliary_loss_clip": 0.01498082, + "auxiliary_loss_mlp": 0.01263598, + "balance_loss_clip": 1.14189911, + "balance_loss_mlp": 1.0263238, + "epoch": 0.6537605964047376, + "flos": 21581603850240.0, + "grad_norm": 2.519908918556402, + "language_loss": 0.73138136, + "learning_rate": 1.1310295795070253e-06, + "loss": 0.75899816, + "num_input_tokens_seen": 117017620, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.3671875, + "step": 5437, + "time_per_iteration": 3.03023624420166 + }, + { + "auxiliary_loss_clip": 0.01500702, + "auxiliary_loss_mlp": 0.01268601, + "balance_loss_clip": 1.14346457, + "balance_loss_mlp": 1.03037333, + "epoch": 0.6538808392953767, + "flos": 26836451167680.0, + "grad_norm": 2.6709428420811947, + "language_loss": 0.81162673, + "learning_rate": 1.1303280420569982e-06, + "loss": 0.83931971, + "num_input_tokens_seen": 117039505, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 2.37695312, + "step": 5438, + "time_per_iteration": 3.0760278701782227 + }, + { + "auxiliary_loss_clip": 0.01497643, + "auxiliary_loss_mlp": 0.0127283, + "balance_loss_clip": 1.13879609, + "balance_loss_mlp": 1.03383911, + "epoch": 0.6540010821860157, + "flos": 30740928724800.0, + "grad_norm": 1.952228406457315, + "language_loss": 0.77306986, + "learning_rate": 1.1296266365355158e-06, + "loss": 0.80077457, + "num_input_tokens_seen": 117062890, + "router_z_loss_clip": 3.58984375, + "router_z_loss_mlp": 2.38476562, + "step": 5439, + "time_per_iteration": 3.1973161697387695 + }, + { + "auxiliary_loss_clip": 0.01498809, + "auxiliary_loss_mlp": 0.01270567, + "balance_loss_clip": 1.14167786, + "balance_loss_mlp": 1.03062248, + "epoch": 0.6541213250766549, + "flos": 26909615316960.0, + "grad_norm": 2.3145242585457964, + "language_loss": 0.73973465, + "learning_rate": 1.1289253630489806e-06, + "loss": 0.76742846, + "num_input_tokens_seen": 117083940, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 2.39453125, + "step": 5440, + "time_per_iteration": 3.1333343982696533 + }, + { + "auxiliary_loss_clip": 0.01500586, + "auxiliary_loss_mlp": 0.01268245, + "balance_loss_clip": 1.14321077, + "balance_loss_mlp": 1.02715647, + "epoch": 0.6542415679672939, + "flos": 19174431202560.0, + "grad_norm": 2.5911364915583603, + "language_loss": 0.72439361, + "learning_rate": 1.1282242217037753e-06, + "loss": 0.75208187, + "num_input_tokens_seen": 117101440, + "router_z_loss_clip": 3.57617188, + "router_z_loss_mlp": 2.40625, + "step": 5441, + "time_per_iteration": 3.090348720550537 + }, + { + "auxiliary_loss_clip": 0.01496615, + "auxiliary_loss_mlp": 0.01274379, + "balance_loss_clip": 1.13944662, + "balance_loss_mlp": 1.03386235, + "epoch": 0.654361810857933, + "flos": 48176905590720.0, + "grad_norm": 2.1244774262819566, + "language_loss": 0.61906379, + "learning_rate": 1.127523212606262e-06, + "loss": 0.64677376, + "num_input_tokens_seen": 117124265, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.40039062, + "step": 5442, + "time_per_iteration": 3.2549326419830322 + }, + { + "auxiliary_loss_clip": 0.01494993, + "auxiliary_loss_mlp": 0.01263957, + "balance_loss_clip": 1.13777804, + "balance_loss_mlp": 1.02591968, + "epoch": 0.6544820537485722, + "flos": 26945609505120.0, + "grad_norm": 1.586151394176629, + "language_loss": 0.73181754, + "learning_rate": 1.1268223358627835e-06, + "loss": 0.75940704, + "num_input_tokens_seen": 117146755, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.375, + "step": 5443, + "time_per_iteration": 3.180034875869751 + }, + { + "auxiliary_loss_clip": 0.01496628, + "auxiliary_loss_mlp": 0.01266819, + "balance_loss_clip": 1.13880086, + "balance_loss_mlp": 1.02821016, + "epoch": 0.6546022966392112, + "flos": 20888421876000.0, + "grad_norm": 1.7977734446133085, + "language_loss": 0.71876633, + "learning_rate": 1.126121591579663e-06, + "loss": 0.74640077, + "num_input_tokens_seen": 117165960, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 2.38085938, + "step": 5444, + "time_per_iteration": 3.092484951019287 + }, + { + "auxiliary_loss_clip": 0.0149652, + "auxiliary_loss_mlp": 0.01269999, + "balance_loss_clip": 1.1384995, + "balance_loss_mlp": 1.035586, + "epoch": 0.6547225395298503, + "flos": 24939038090880.0, + "grad_norm": 1.7587435690383326, + "language_loss": 0.6859675, + "learning_rate": 1.1254209798632018e-06, + "loss": 0.71363264, + "num_input_tokens_seen": 117186980, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 2.33984375, + "step": 5445, + "time_per_iteration": 3.933363199234009 + }, + { + "auxiliary_loss_clip": 0.01499279, + "auxiliary_loss_mlp": 0.01273152, + "balance_loss_clip": 1.14159966, + "balance_loss_mlp": 1.03435206, + "epoch": 0.6548427824204894, + "flos": 22568200984800.0, + "grad_norm": 4.360739130882172, + "language_loss": 0.84725004, + "learning_rate": 1.124720500819683e-06, + "loss": 0.87497437, + "num_input_tokens_seen": 117205135, + "router_z_loss_clip": 3.57617188, + "router_z_loss_mlp": 2.3828125, + "step": 5446, + "time_per_iteration": 3.055725574493408 + }, + { + "auxiliary_loss_clip": 0.01494222, + "auxiliary_loss_mlp": 0.01265926, + "balance_loss_clip": 1.13765883, + "balance_loss_mlp": 1.02636361, + "epoch": 0.6549630253111285, + "flos": 18444496476960.0, + "grad_norm": 2.5441408702993487, + "language_loss": 0.82518512, + "learning_rate": 1.1240201545553682e-06, + "loss": 0.85278654, + "num_input_tokens_seen": 117222935, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.390625, + "step": 5447, + "time_per_iteration": 3.895005941390991 + }, + { + "auxiliary_loss_clip": 0.01495461, + "auxiliary_loss_mlp": 0.01259756, + "balance_loss_clip": 1.13798666, + "balance_loss_mlp": 1.02057421, + "epoch": 0.6550832682017675, + "flos": 25189404425280.0, + "grad_norm": 1.9640889772180197, + "language_loss": 0.73378873, + "learning_rate": 1.1233199411764987e-06, + "loss": 0.76134086, + "num_input_tokens_seen": 117242370, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 2.38671875, + "step": 5448, + "time_per_iteration": 3.046396255493164 + }, + { + "auxiliary_loss_clip": 0.01494607, + "auxiliary_loss_mlp": 0.01270703, + "balance_loss_clip": 1.13735509, + "balance_loss_mlp": 1.03247523, + "epoch": 0.6552035110924067, + "flos": 22750409687040.0, + "grad_norm": 2.086537761087643, + "language_loss": 0.6911248, + "learning_rate": 1.1226198607892978e-06, + "loss": 0.71877789, + "num_input_tokens_seen": 117262930, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.37695312, + "step": 5449, + "time_per_iteration": 3.0089054107666016 + }, + { + "auxiliary_loss_clip": 0.01498571, + "auxiliary_loss_mlp": 0.01272643, + "balance_loss_clip": 1.1423986, + "balance_loss_mlp": 1.03269839, + "epoch": 0.6553237539830458, + "flos": 21801361795200.0, + "grad_norm": 2.209392852933334, + "language_loss": 0.80211544, + "learning_rate": 1.1219199134999664e-06, + "loss": 0.82982755, + "num_input_tokens_seen": 117281430, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.39453125, + "step": 5450, + "time_per_iteration": 3.0849626064300537 + }, + { + "auxiliary_loss_clip": 0.01496997, + "auxiliary_loss_mlp": 0.01277228, + "balance_loss_clip": 1.13938808, + "balance_loss_mlp": 1.03823709, + "epoch": 0.6554439968736848, + "flos": 20889370080000.0, + "grad_norm": 2.0996478041226365, + "language_loss": 0.78872943, + "learning_rate": 1.1212200994146863e-06, + "loss": 0.81647164, + "num_input_tokens_seen": 117299185, + "router_z_loss_clip": 3.57617188, + "router_z_loss_mlp": 2.38476562, + "step": 5451, + "time_per_iteration": 3.8932788372039795 + }, + { + "auxiliary_loss_clip": 0.01485982, + "auxiliary_loss_mlp": 0.01268164, + "balance_loss_clip": 1.12860537, + "balance_loss_mlp": 1.03069878, + "epoch": 0.655564239764324, + "flos": 16140641230080.0, + "grad_norm": 1.9678201494923322, + "language_loss": 0.75754237, + "learning_rate": 1.120520418639618e-06, + "loss": 0.78508389, + "num_input_tokens_seen": 117317720, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 2.37109375, + "step": 5452, + "time_per_iteration": 2.945509672164917 + }, + { + "auxiliary_loss_clip": 0.01494461, + "auxiliary_loss_mlp": 0.0126582, + "balance_loss_clip": 1.1371851, + "balance_loss_mlp": 1.02835548, + "epoch": 0.655684482654963, + "flos": 29572957307520.0, + "grad_norm": 2.089375201718091, + "language_loss": 0.83314157, + "learning_rate": 1.119820871280903e-06, + "loss": 0.86074436, + "num_input_tokens_seen": 117338795, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.36914062, + "step": 5453, + "time_per_iteration": 3.030550241470337 + }, + { + "auxiliary_loss_clip": 0.01494826, + "auxiliary_loss_mlp": 0.01270535, + "balance_loss_clip": 1.13731253, + "balance_loss_mlp": 1.03059101, + "epoch": 0.6558047255456021, + "flos": 29792032545600.0, + "grad_norm": 6.502931739751964, + "language_loss": 0.73852426, + "learning_rate": 1.1191214574446614e-06, + "loss": 0.76617789, + "num_input_tokens_seen": 117359040, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 2.39453125, + "step": 5454, + "time_per_iteration": 3.018524408340454 + }, + { + "auxiliary_loss_clip": 0.01490524, + "auxiliary_loss_mlp": 0.01277068, + "balance_loss_clip": 1.13236654, + "balance_loss_mlp": 1.03712392, + "epoch": 0.6559249684362413, + "flos": 29061718538400.0, + "grad_norm": 1.7110329251570644, + "language_loss": 0.79718482, + "learning_rate": 1.118422177236995e-06, + "loss": 0.82486081, + "num_input_tokens_seen": 117380865, + "router_z_loss_clip": 3.58203125, + "router_z_loss_mlp": 2.39453125, + "step": 5455, + "time_per_iteration": 2.97109317779541 + }, + { + "auxiliary_loss_clip": 0.01487878, + "auxiliary_loss_mlp": 0.01271813, + "balance_loss_clip": 1.13016582, + "balance_loss_mlp": 1.03072417, + "epoch": 0.6560452113268803, + "flos": 20227289561280.0, + "grad_norm": 3.9047508989884805, + "language_loss": 0.85614586, + "learning_rate": 1.1177230307639835e-06, + "loss": 0.88374275, + "num_input_tokens_seen": 117398405, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 2.40625, + "step": 5456, + "time_per_iteration": 2.988875389099121 + }, + { + "auxiliary_loss_clip": 0.01484812, + "auxiliary_loss_mlp": 0.01269201, + "balance_loss_clip": 1.12686992, + "balance_loss_mlp": 1.02982903, + "epoch": 0.6561654542175194, + "flos": 25047855074880.0, + "grad_norm": 1.7877380549269422, + "language_loss": 0.78725171, + "learning_rate": 1.1170240181316865e-06, + "loss": 0.81479186, + "num_input_tokens_seen": 117419850, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 2.38867188, + "step": 5457, + "time_per_iteration": 2.95965576171875 + }, + { + "auxiliary_loss_clip": 0.01487799, + "auxiliary_loss_mlp": 0.01268945, + "balance_loss_clip": 1.12961149, + "balance_loss_mlp": 1.03338695, + "epoch": 0.6562856971081584, + "flos": 22858923245760.0, + "grad_norm": 2.028697485237683, + "language_loss": 0.79404795, + "learning_rate": 1.1163251394461442e-06, + "loss": 0.82161546, + "num_input_tokens_seen": 117438330, + "router_z_loss_clip": 3.58203125, + "router_z_loss_mlp": 2.34960938, + "step": 5458, + "time_per_iteration": 3.8325204849243164 + }, + { + "auxiliary_loss_clip": 0.0149528, + "auxiliary_loss_mlp": 0.01267689, + "balance_loss_clip": 1.13866007, + "balance_loss_mlp": 1.02869844, + "epoch": 0.6564059399987976, + "flos": 18874151045280.0, + "grad_norm": 2.51398830145666, + "language_loss": 0.82077491, + "learning_rate": 1.1156263948133746e-06, + "loss": 0.84840465, + "num_input_tokens_seen": 117454985, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.38476562, + "step": 5459, + "time_per_iteration": 2.9782721996307373 + }, + { + "auxiliary_loss_clip": 0.01487011, + "auxiliary_loss_mlp": 0.01264133, + "balance_loss_clip": 1.13013697, + "balance_loss_mlp": 1.02456975, + "epoch": 0.6565261828894366, + "flos": 25486574473440.0, + "grad_norm": 1.934340798874292, + "language_loss": 0.77679765, + "learning_rate": 1.1149277843393787e-06, + "loss": 0.80430913, + "num_input_tokens_seen": 117476145, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.390625, + "step": 5460, + "time_per_iteration": 3.084031820297241 + }, + { + "auxiliary_loss_clip": 0.01487544, + "auxiliary_loss_mlp": 0.01269827, + "balance_loss_clip": 1.12896788, + "balance_loss_mlp": 1.03217125, + "epoch": 0.6566464257800757, + "flos": 19685594115360.0, + "grad_norm": 2.2507771200570077, + "language_loss": 0.63329065, + "learning_rate": 1.1142293081301342e-06, + "loss": 0.66086435, + "num_input_tokens_seen": 117494025, + "router_z_loss_clip": 3.5859375, + "router_z_loss_mlp": 2.37109375, + "step": 5461, + "time_per_iteration": 3.1357343196868896 + }, + { + "auxiliary_loss_clip": 0.01485899, + "auxiliary_loss_mlp": 0.01264909, + "balance_loss_clip": 1.12805605, + "balance_loss_mlp": 1.03068697, + "epoch": 0.6567666686707149, + "flos": 23516414457120.0, + "grad_norm": 3.2786622691482297, + "language_loss": 0.67896193, + "learning_rate": 1.1135309662915995e-06, + "loss": 0.70647001, + "num_input_tokens_seen": 117514190, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 2.33789062, + "step": 5462, + "time_per_iteration": 3.0770044326782227 + }, + { + "auxiliary_loss_clip": 0.014796, + "auxiliary_loss_mlp": 0.01278709, + "balance_loss_clip": 1.12125301, + "balance_loss_mlp": 1.03876448, + "epoch": 0.6568869115613539, + "flos": 32784139105920.0, + "grad_norm": 2.3039994580639935, + "language_loss": 0.60651088, + "learning_rate": 1.112832758929712e-06, + "loss": 0.63409388, + "num_input_tokens_seen": 117536800, + "router_z_loss_clip": 3.5859375, + "router_z_loss_mlp": 2.39453125, + "step": 5463, + "time_per_iteration": 3.1054580211639404 + }, + { + "auxiliary_loss_clip": 0.01482674, + "auxiliary_loss_mlp": 0.012597, + "balance_loss_clip": 1.12539744, + "balance_loss_mlp": 1.01956439, + "epoch": 0.657007154451993, + "flos": 18444420620640.0, + "grad_norm": 1.9299446709171233, + "language_loss": 0.74971724, + "learning_rate": 1.11213468615039e-06, + "loss": 0.77714097, + "num_input_tokens_seen": 117556230, + "router_z_loss_clip": 3.57617188, + "router_z_loss_mlp": 2.39648438, + "step": 5464, + "time_per_iteration": 3.0336859226226807 + }, + { + "auxiliary_loss_clip": 0.01486058, + "auxiliary_loss_mlp": 0.01265266, + "balance_loss_clip": 1.12937641, + "balance_loss_mlp": 1.0298996, + "epoch": 0.6571273973426321, + "flos": 25159137389280.0, + "grad_norm": 1.7101960580578552, + "language_loss": 0.75686419, + "learning_rate": 1.1114367480595292e-06, + "loss": 0.78437746, + "num_input_tokens_seen": 117577310, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.34765625, + "step": 5465, + "time_per_iteration": 2.999100685119629 + }, + { + "auxiliary_loss_clip": 0.01485196, + "auxiliary_loss_mlp": 0.01281641, + "balance_loss_clip": 1.12684917, + "balance_loss_mlp": 1.04207826, + "epoch": 0.6572476402332712, + "flos": 17531746198560.0, + "grad_norm": 2.3828480579646625, + "language_loss": 0.816329, + "learning_rate": 1.1107389447630086e-06, + "loss": 0.84399748, + "num_input_tokens_seen": 117596010, + "router_z_loss_clip": 3.58398438, + "router_z_loss_mlp": 2.390625, + "step": 5466, + "time_per_iteration": 2.9728610515594482 + }, + { + "auxiliary_loss_clip": 0.01482075, + "auxiliary_loss_mlp": 0.01265915, + "balance_loss_clip": 1.12455356, + "balance_loss_mlp": 1.02902222, + "epoch": 0.6573678831239103, + "flos": 17015880193920.0, + "grad_norm": 3.127737089220538, + "language_loss": 0.78375572, + "learning_rate": 1.1100412763666818e-06, + "loss": 0.81123567, + "num_input_tokens_seen": 117611270, + "router_z_loss_clip": 3.58007812, + "router_z_loss_mlp": 2.36328125, + "step": 5467, + "time_per_iteration": 3.0162155628204346 + }, + { + "auxiliary_loss_clip": 0.0148449, + "auxiliary_loss_mlp": 0.01271703, + "balance_loss_clip": 1.12803614, + "balance_loss_mlp": 1.03328443, + "epoch": 0.6574881260145494, + "flos": 23912085029760.0, + "grad_norm": 1.578124994679298, + "language_loss": 0.79935479, + "learning_rate": 1.1093437429763865e-06, + "loss": 0.82691669, + "num_input_tokens_seen": 117631535, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.37890625, + "step": 5468, + "time_per_iteration": 2.95274019241333 + }, + { + "auxiliary_loss_clip": 0.01484897, + "auxiliary_loss_mlp": 0.01267934, + "balance_loss_clip": 1.12770855, + "balance_loss_mlp": 1.03008771, + "epoch": 0.6576083689051885, + "flos": 11220058065600.0, + "grad_norm": 2.3006147915350654, + "language_loss": 0.73801208, + "learning_rate": 1.1086463446979361e-06, + "loss": 0.76554036, + "num_input_tokens_seen": 117649885, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.37304688, + "step": 5469, + "time_per_iteration": 2.965156078338623 + }, + { + "auxiliary_loss_clip": 0.01488939, + "auxiliary_loss_mlp": 0.0125901, + "balance_loss_clip": 1.13283396, + "balance_loss_mlp": 1.01944661, + "epoch": 0.6577286117958275, + "flos": 22457753089920.0, + "grad_norm": 1.9253579515937367, + "language_loss": 0.77476227, + "learning_rate": 1.1079490816371277e-06, + "loss": 0.8022418, + "num_input_tokens_seen": 117669650, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.39257812, + "step": 5470, + "time_per_iteration": 3.0451111793518066 + }, + { + "auxiliary_loss_clip": 0.01477802, + "auxiliary_loss_mlp": 0.01262331, + "balance_loss_clip": 1.12040401, + "balance_loss_mlp": 1.02448499, + "epoch": 0.6578488546864667, + "flos": 21874374231840.0, + "grad_norm": 4.058233616597672, + "language_loss": 0.74689764, + "learning_rate": 1.1072519538997352e-06, + "loss": 0.77429897, + "num_input_tokens_seen": 117688790, + "router_z_loss_clip": 3.57617188, + "router_z_loss_mlp": 2.37304688, + "step": 5471, + "time_per_iteration": 3.004537343978882 + }, + { + "auxiliary_loss_clip": 0.01484163, + "auxiliary_loss_mlp": 0.01263347, + "balance_loss_clip": 1.12772322, + "balance_loss_mlp": 1.02416503, + "epoch": 0.6579690975771058, + "flos": 23545657432800.0, + "grad_norm": 2.346979567150296, + "language_loss": 0.82176375, + "learning_rate": 1.1065549615915095e-06, + "loss": 0.84923887, + "num_input_tokens_seen": 117708620, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.38671875, + "step": 5472, + "time_per_iteration": 3.7899422645568848 + }, + { + "auxiliary_loss_clip": 0.01487274, + "auxiliary_loss_mlp": 0.01267279, + "balance_loss_clip": 1.1295737, + "balance_loss_mlp": 1.0300045, + "epoch": 0.6580893404677448, + "flos": 32746741575840.0, + "grad_norm": 2.258566510076315, + "language_loss": 0.7832256, + "learning_rate": 1.105858104818187e-06, + "loss": 0.81077111, + "num_input_tokens_seen": 117729775, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 2.3671875, + "step": 5473, + "time_per_iteration": 3.0141477584838867 + }, + { + "auxiliary_loss_clip": 0.01483242, + "auxiliary_loss_mlp": 0.01283556, + "balance_loss_clip": 1.1250416, + "balance_loss_mlp": 1.04361129, + "epoch": 0.658209583358384, + "flos": 15889971470400.0, + "grad_norm": 3.677046621665683, + "language_loss": 0.7501117, + "learning_rate": 1.105161383685478e-06, + "loss": 0.7777797, + "num_input_tokens_seen": 117746160, + "router_z_loss_clip": 3.5859375, + "router_z_loss_mlp": 2.39453125, + "step": 5474, + "time_per_iteration": 3.9404945373535156 + }, + { + "auxiliary_loss_clip": 0.01444032, + "auxiliary_loss_mlp": 0.01200256, + "balance_loss_clip": 1.09712756, + "balance_loss_mlp": 1.00494385, + "epoch": 0.658329826249023, + "flos": 62702208020160.0, + "grad_norm": 0.7330090856394887, + "language_loss": 0.56279945, + "learning_rate": 1.1044647982990771e-06, + "loss": 0.58924234, + "num_input_tokens_seen": 117808045, + "router_z_loss_clip": 3.46875, + "router_z_loss_mlp": 1.953125, + "step": 5475, + "time_per_iteration": 3.38917875289917 + }, + { + "auxiliary_loss_clip": 0.01487151, + "auxiliary_loss_mlp": 0.01274125, + "balance_loss_clip": 1.13043976, + "balance_loss_mlp": 1.03284502, + "epoch": 0.6584500691396621, + "flos": 31725326026080.0, + "grad_norm": 4.300286621848699, + "language_loss": 0.64638579, + "learning_rate": 1.1037683487646536e-06, + "loss": 0.67399853, + "num_input_tokens_seen": 117828330, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.40820312, + "step": 5476, + "time_per_iteration": 3.0455985069274902 + }, + { + "auxiliary_loss_clip": 0.0148086, + "auxiliary_loss_mlp": 0.01268709, + "balance_loss_clip": 1.12446153, + "balance_loss_mlp": 1.03238833, + "epoch": 0.6585703120303013, + "flos": 18408085079040.0, + "grad_norm": 1.9803041439699869, + "language_loss": 0.77148652, + "learning_rate": 1.1030720351878583e-06, + "loss": 0.79898226, + "num_input_tokens_seen": 117846450, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.35742188, + "step": 5477, + "time_per_iteration": 2.9852020740509033 + }, + { + "auxiliary_loss_clip": 0.01443417, + "auxiliary_loss_mlp": 0.01195808, + "balance_loss_clip": 1.0965476, + "balance_loss_mlp": 1.00049591, + "epoch": 0.6586905549209403, + "flos": 58315507100640.0, + "grad_norm": 0.8230116665853898, + "language_loss": 0.57597744, + "learning_rate": 1.102375857674323e-06, + "loss": 0.60236967, + "num_input_tokens_seen": 117908365, + "router_z_loss_clip": 3.46875, + "router_z_loss_mlp": 1.953125, + "step": 5478, + "time_per_iteration": 3.320739984512329 + }, + { + "auxiliary_loss_clip": 0.0148145, + "auxiliary_loss_mlp": 0.01261216, + "balance_loss_clip": 1.12444019, + "balance_loss_mlp": 1.02565801, + "epoch": 0.6588107978115794, + "flos": 22784772964320.0, + "grad_norm": 1.8327768631527555, + "language_loss": 0.90565372, + "learning_rate": 1.1016798163296561e-06, + "loss": 0.93308043, + "num_input_tokens_seen": 117927565, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.34960938, + "step": 5479, + "time_per_iteration": 3.856621503829956 + }, + { + "auxiliary_loss_clip": 0.01482348, + "auxiliary_loss_mlp": 0.01269609, + "balance_loss_clip": 1.12399817, + "balance_loss_mlp": 1.03138125, + "epoch": 0.6589310407022185, + "flos": 20669612135040.0, + "grad_norm": 2.3759046554647596, + "language_loss": 0.66248691, + "learning_rate": 1.1009839112594471e-06, + "loss": 0.69000649, + "num_input_tokens_seen": 117945590, + "router_z_loss_clip": 3.5859375, + "router_z_loss_mlp": 2.37695312, + "step": 5480, + "time_per_iteration": 3.1288249492645264 + }, + { + "auxiliary_loss_clip": 0.01480245, + "auxiliary_loss_mlp": 0.01268296, + "balance_loss_clip": 1.12243903, + "balance_loss_mlp": 1.03006816, + "epoch": 0.6590512835928576, + "flos": 25633016556480.0, + "grad_norm": 2.1844798937363783, + "language_loss": 0.71842319, + "learning_rate": 1.1002881425692638e-06, + "loss": 0.74590856, + "num_input_tokens_seen": 117966020, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 2.37695312, + "step": 5481, + "time_per_iteration": 2.9645347595214844 + }, + { + "auxiliary_loss_clip": 0.0148005, + "auxiliary_loss_mlp": 0.01272548, + "balance_loss_clip": 1.1231693, + "balance_loss_mlp": 1.03603733, + "epoch": 0.6591715264834966, + "flos": 23728017847680.0, + "grad_norm": 2.1926116923842907, + "language_loss": 0.75165987, + "learning_rate": 1.0995925103646532e-06, + "loss": 0.77918583, + "num_input_tokens_seen": 117984620, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.359375, + "step": 5482, + "time_per_iteration": 2.9956929683685303 + }, + { + "auxiliary_loss_clip": 0.01484591, + "auxiliary_loss_mlp": 0.01264102, + "balance_loss_clip": 1.12720871, + "balance_loss_mlp": 1.02835393, + "epoch": 0.6592917693741358, + "flos": 35776852516800.0, + "grad_norm": 1.5678669775828753, + "language_loss": 0.66607547, + "learning_rate": 1.0988970147511437e-06, + "loss": 0.69356239, + "num_input_tokens_seen": 118006500, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.3515625, + "step": 5483, + "time_per_iteration": 3.083815574645996 + }, + { + "auxiliary_loss_clip": 0.01486695, + "auxiliary_loss_mlp": 0.01278814, + "balance_loss_clip": 1.12990439, + "balance_loss_mlp": 1.03810656, + "epoch": 0.6594120122647749, + "flos": 21398940010080.0, + "grad_norm": 2.3325832062864844, + "language_loss": 0.80872232, + "learning_rate": 1.0982016558342405e-06, + "loss": 0.83637738, + "num_input_tokens_seen": 118025470, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.40234375, + "step": 5484, + "time_per_iteration": 3.037754774093628 + }, + { + "auxiliary_loss_clip": 0.01484371, + "auxiliary_loss_mlp": 0.01270675, + "balance_loss_clip": 1.128654, + "balance_loss_mlp": 1.03073049, + "epoch": 0.6595322551554139, + "flos": 19353719436480.0, + "grad_norm": 1.9416493222478899, + "language_loss": 0.71582544, + "learning_rate": 1.0975064337194291e-06, + "loss": 0.7433759, + "num_input_tokens_seen": 118043515, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.39453125, + "step": 5485, + "time_per_iteration": 3.966355085372925 + }, + { + "auxiliary_loss_clip": 0.01482115, + "auxiliary_loss_mlp": 0.01268116, + "balance_loss_clip": 1.12454855, + "balance_loss_mlp": 1.03026962, + "epoch": 0.6596524980460531, + "flos": 16839019362240.0, + "grad_norm": 2.6420912223658757, + "language_loss": 0.70409536, + "learning_rate": 1.0968113485121743e-06, + "loss": 0.7315976, + "num_input_tokens_seen": 118063105, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 2.37304688, + "step": 5486, + "time_per_iteration": 3.004323959350586 + }, + { + "auxiliary_loss_clip": 0.01480339, + "auxiliary_loss_mlp": 0.01263501, + "balance_loss_clip": 1.12263322, + "balance_loss_mlp": 1.02355647, + "epoch": 0.6597727409366921, + "flos": 21800717016480.0, + "grad_norm": 1.9285329990830944, + "language_loss": 0.80262697, + "learning_rate": 1.0961164003179185e-06, + "loss": 0.83006531, + "num_input_tokens_seen": 118081615, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 2.39453125, + "step": 5487, + "time_per_iteration": 2.964796304702759 + }, + { + "auxiliary_loss_clip": 0.01484895, + "auxiliary_loss_mlp": 0.01264052, + "balance_loss_clip": 1.12913895, + "balance_loss_mlp": 1.02544284, + "epoch": 0.6598929838273312, + "flos": 23732872652160.0, + "grad_norm": 2.1822185838518506, + "language_loss": 0.84719718, + "learning_rate": 1.0954215892420884e-06, + "loss": 0.87468672, + "num_input_tokens_seen": 118102315, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.38085938, + "step": 5488, + "time_per_iteration": 2.983031749725342 + }, + { + "auxiliary_loss_clip": 0.01485445, + "auxiliary_loss_mlp": 0.0127848, + "balance_loss_clip": 1.13011456, + "balance_loss_mlp": 1.03834438, + "epoch": 0.6600132267179702, + "flos": 19976581873440.0, + "grad_norm": 1.7757654226080584, + "language_loss": 0.7101357, + "learning_rate": 1.094726915390082e-06, + "loss": 0.73777497, + "num_input_tokens_seen": 118120650, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.39648438, + "step": 5489, + "time_per_iteration": 2.964911937713623 + }, + { + "auxiliary_loss_clip": 0.01479791, + "auxiliary_loss_mlp": 0.01266825, + "balance_loss_clip": 1.12252736, + "balance_loss_mlp": 1.02878845, + "epoch": 0.6601334696086094, + "flos": 22344650223840.0, + "grad_norm": 2.2323736667658465, + "language_loss": 0.70120746, + "learning_rate": 1.0940323788672836e-06, + "loss": 0.72867358, + "num_input_tokens_seen": 118139825, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.375, + "step": 5490, + "time_per_iteration": 2.9282443523406982 + }, + { + "auxiliary_loss_clip": 0.01479282, + "auxiliary_loss_mlp": 0.01260483, + "balance_loss_clip": 1.12280941, + "balance_loss_mlp": 1.02492523, + "epoch": 0.6602537124992485, + "flos": 25705497998880.0, + "grad_norm": 1.7143908602581552, + "language_loss": 0.73650026, + "learning_rate": 1.0933379797790522e-06, + "loss": 0.7638979, + "num_input_tokens_seen": 118159240, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.34960938, + "step": 5491, + "time_per_iteration": 2.957448720932007 + }, + { + "auxiliary_loss_clip": 0.01482935, + "auxiliary_loss_mlp": 0.01262006, + "balance_loss_clip": 1.12706614, + "balance_loss_mlp": 1.02282405, + "epoch": 0.6603739553898875, + "flos": 25850953949760.0, + "grad_norm": 2.352859435596019, + "language_loss": 0.71776652, + "learning_rate": 1.0926437182307293e-06, + "loss": 0.74521589, + "num_input_tokens_seen": 118178050, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.38671875, + "step": 5492, + "time_per_iteration": 2.960826873779297 + }, + { + "auxiliary_loss_clip": 0.01484429, + "auxiliary_loss_mlp": 0.01261249, + "balance_loss_clip": 1.12858486, + "balance_loss_mlp": 1.02225852, + "epoch": 0.6604941982805267, + "flos": 24572534637600.0, + "grad_norm": 2.0245842015565847, + "language_loss": 0.77953368, + "learning_rate": 1.0919495943276338e-06, + "loss": 0.8069905, + "num_input_tokens_seen": 118199070, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.38476562, + "step": 5493, + "time_per_iteration": 3.0403220653533936 + }, + { + "auxiliary_loss_clip": 0.01482791, + "auxiliary_loss_mlp": 0.01276768, + "balance_loss_clip": 1.12774658, + "balance_loss_mlp": 1.03491592, + "epoch": 0.6606144411711657, + "flos": 13263496015680.0, + "grad_norm": 2.7661549931708103, + "language_loss": 0.76275373, + "learning_rate": 1.0912556081750611e-06, + "loss": 0.7903493, + "num_input_tokens_seen": 118217000, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.4140625, + "step": 5494, + "time_per_iteration": 2.976227045059204 + }, + { + "auxiliary_loss_clip": 0.01484605, + "auxiliary_loss_mlp": 0.01260238, + "balance_loss_clip": 1.1300956, + "balance_loss_mlp": 1.02296376, + "epoch": 0.6607346840618048, + "flos": 25157620262880.0, + "grad_norm": 2.1816467002824718, + "language_loss": 0.76196051, + "learning_rate": 1.0905617598782909e-06, + "loss": 0.78940892, + "num_input_tokens_seen": 118237205, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 2.3671875, + "step": 5495, + "time_per_iteration": 3.0380189418792725 + }, + { + "auxiliary_loss_clip": 0.01480188, + "auxiliary_loss_mlp": 0.01274275, + "balance_loss_clip": 1.12324214, + "balance_loss_mlp": 1.03490305, + "epoch": 0.660854926952444, + "flos": 17639880475680.0, + "grad_norm": 10.477313090500248, + "language_loss": 0.81359279, + "learning_rate": 1.0898680495425775e-06, + "loss": 0.84113741, + "num_input_tokens_seen": 118255495, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.38867188, + "step": 5496, + "time_per_iteration": 3.128028154373169 + }, + { + "auxiliary_loss_clip": 0.01486197, + "auxiliary_loss_mlp": 0.01261322, + "balance_loss_clip": 1.13039231, + "balance_loss_mlp": 1.02309465, + "epoch": 0.660975169843083, + "flos": 16837843589280.0, + "grad_norm": 1.9399084022535258, + "language_loss": 0.80270457, + "learning_rate": 1.0891744772731594e-06, + "loss": 0.83017981, + "num_input_tokens_seen": 118273310, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.37695312, + "step": 5497, + "time_per_iteration": 3.0690789222717285 + }, + { + "auxiliary_loss_clip": 0.01481911, + "auxiliary_loss_mlp": 0.01265066, + "balance_loss_clip": 1.12503588, + "balance_loss_mlp": 1.0281738, + "epoch": 0.6610954127337221, + "flos": 26872862565600.0, + "grad_norm": 1.5426721139300061, + "language_loss": 0.66090679, + "learning_rate": 1.088481043175248e-06, + "loss": 0.68837655, + "num_input_tokens_seen": 118293880, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 2.36328125, + "step": 5498, + "time_per_iteration": 3.0274102687835693 + }, + { + "auxiliary_loss_clip": 0.01481191, + "auxiliary_loss_mlp": 0.01271031, + "balance_loss_clip": 1.12412143, + "balance_loss_mlp": 1.03356588, + "epoch": 0.6612156556243612, + "flos": 26467937521920.0, + "grad_norm": 2.0143387347462185, + "language_loss": 0.75828612, + "learning_rate": 1.0877877473540368e-06, + "loss": 0.78580832, + "num_input_tokens_seen": 118314465, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.36914062, + "step": 5499, + "time_per_iteration": 4.001155376434326 + }, + { + "auxiliary_loss_clip": 0.01480798, + "auxiliary_loss_mlp": 0.01270353, + "balance_loss_clip": 1.1244874, + "balance_loss_mlp": 1.03193414, + "epoch": 0.6613358985150003, + "flos": 19793955961440.0, + "grad_norm": 1.8854963959342415, + "language_loss": 0.72442847, + "learning_rate": 1.0870945899147002e-06, + "loss": 0.75194001, + "num_input_tokens_seen": 118331110, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.37890625, + "step": 5500, + "time_per_iteration": 3.0893490314483643 + }, + { + "auxiliary_loss_clip": 0.01484806, + "auxiliary_loss_mlp": 0.01270609, + "balance_loss_clip": 1.12996459, + "balance_loss_mlp": 1.02932906, + "epoch": 0.6614561414056394, + "flos": 26833947909120.0, + "grad_norm": 2.55345824485433, + "language_loss": 0.7610864, + "learning_rate": 1.0864015709623879e-06, + "loss": 0.78864056, + "num_input_tokens_seen": 118351980, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 2.40820312, + "step": 5501, + "time_per_iteration": 3.080557346343994 + }, + { + "auxiliary_loss_clip": 0.01481865, + "auxiliary_loss_mlp": 0.01270716, + "balance_loss_clip": 1.12658453, + "balance_loss_mlp": 1.03439558, + "epoch": 0.6615763842962785, + "flos": 22896889698240.0, + "grad_norm": 3.425561898389379, + "language_loss": 0.80317974, + "learning_rate": 1.0857086906022313e-06, + "loss": 0.83070552, + "num_input_tokens_seen": 118370315, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.35742188, + "step": 5502, + "time_per_iteration": 3.0873517990112305 + }, + { + "auxiliary_loss_clip": 0.01488674, + "auxiliary_loss_mlp": 0.0126206, + "balance_loss_clip": 1.13294625, + "balance_loss_mlp": 1.02554893, + "epoch": 0.6616966271869176, + "flos": 24792633936000.0, + "grad_norm": 1.9658988638267023, + "language_loss": 0.73148501, + "learning_rate": 1.0850159489393388e-06, + "loss": 0.75899231, + "num_input_tokens_seen": 118389575, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.36132812, + "step": 5503, + "time_per_iteration": 4.035480260848999 + }, + { + "auxiliary_loss_clip": 0.01478509, + "auxiliary_loss_mlp": 0.01271241, + "balance_loss_clip": 1.12137747, + "balance_loss_mlp": 1.03282213, + "epoch": 0.6618168700775566, + "flos": 17204043617280.0, + "grad_norm": 2.5944641843339973, + "language_loss": 0.81952882, + "learning_rate": 1.0843233460787992e-06, + "loss": 0.84702629, + "num_input_tokens_seen": 118406790, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 2.37890625, + "step": 5504, + "time_per_iteration": 2.971224069595337 + }, + { + "auxiliary_loss_clip": 0.01481112, + "auxiliary_loss_mlp": 0.01259803, + "balance_loss_clip": 1.12570572, + "balance_loss_mlp": 1.02252889, + "epoch": 0.6619371129681958, + "flos": 25449442440480.0, + "grad_norm": 1.8263530641154782, + "language_loss": 0.77519572, + "learning_rate": 1.0836308821256805e-06, + "loss": 0.80260485, + "num_input_tokens_seen": 118427590, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.3671875, + "step": 5505, + "time_per_iteration": 3.1911940574645996 + }, + { + "auxiliary_loss_clip": 0.01481736, + "auxiliary_loss_mlp": 0.01262913, + "balance_loss_clip": 1.12541187, + "balance_loss_mlp": 1.02678299, + "epoch": 0.6620573558588349, + "flos": 18042378117120.0, + "grad_norm": 2.125952691158624, + "language_loss": 0.77640074, + "learning_rate": 1.0829385571850282e-06, + "loss": 0.80384731, + "num_input_tokens_seen": 118444570, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.35546875, + "step": 5506, + "time_per_iteration": 3.8461358547210693 + }, + { + "auxiliary_loss_clip": 0.01482185, + "auxiliary_loss_mlp": 0.01280425, + "balance_loss_clip": 1.12620664, + "balance_loss_mlp": 1.03876376, + "epoch": 0.6621775987494739, + "flos": 17787536259840.0, + "grad_norm": 13.368375469047178, + "language_loss": 0.83728218, + "learning_rate": 1.0822463713618679e-06, + "loss": 0.86490822, + "num_input_tokens_seen": 118461425, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.41210938, + "step": 5507, + "time_per_iteration": 3.0159616470336914 + }, + { + "auxiliary_loss_clip": 0.01482622, + "auxiliary_loss_mlp": 0.0126294, + "balance_loss_clip": 1.12627244, + "balance_loss_mlp": 1.0243305, + "epoch": 0.6622978416401131, + "flos": 17494462452960.0, + "grad_norm": 2.173061332941983, + "language_loss": 0.84852421, + "learning_rate": 1.0815543247612034e-06, + "loss": 0.8759799, + "num_input_tokens_seen": 118478495, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.38085938, + "step": 5508, + "time_per_iteration": 3.071484327316284 + }, + { + "auxiliary_loss_clip": 0.01479904, + "auxiliary_loss_mlp": 0.01260575, + "balance_loss_clip": 1.12264013, + "balance_loss_mlp": 1.02463603, + "epoch": 0.6624180845307521, + "flos": 21650254548480.0, + "grad_norm": 1.9547613230355514, + "language_loss": 0.83107221, + "learning_rate": 1.0808624174880168e-06, + "loss": 0.858477, + "num_input_tokens_seen": 118499145, + "router_z_loss_clip": 3.57617188, + "router_z_loss_mlp": 2.35351562, + "step": 5509, + "time_per_iteration": 3.045828342437744 + }, + { + "auxiliary_loss_clip": 0.01481923, + "auxiliary_loss_mlp": 0.01267067, + "balance_loss_clip": 1.12757254, + "balance_loss_mlp": 1.029984, + "epoch": 0.6625383274213912, + "flos": 23808198706560.0, + "grad_norm": 1.8151840909321908, + "language_loss": 0.79781139, + "learning_rate": 1.080170649647272e-06, + "loss": 0.82530129, + "num_input_tokens_seen": 118518950, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 2.36523438, + "step": 5510, + "time_per_iteration": 3.025773525238037 + }, + { + "auxiliary_loss_clip": 0.01477355, + "auxiliary_loss_mlp": 0.01267193, + "balance_loss_clip": 1.12106788, + "balance_loss_mlp": 1.02896547, + "epoch": 0.6626585703120303, + "flos": 33266172827520.0, + "grad_norm": 1.6980341832585608, + "language_loss": 0.67337525, + "learning_rate": 1.0794790213439068e-06, + "loss": 0.70082068, + "num_input_tokens_seen": 118545850, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.37695312, + "step": 5511, + "time_per_iteration": 3.084710121154785 + }, + { + "auxiliary_loss_clip": 0.01483673, + "auxiliary_loss_mlp": 0.01262094, + "balance_loss_clip": 1.1269033, + "balance_loss_mlp": 1.02043307, + "epoch": 0.6627788132026694, + "flos": 22087684389600.0, + "grad_norm": 2.4170254474473056, + "language_loss": 0.78612125, + "learning_rate": 1.078787532682843e-06, + "loss": 0.81357896, + "num_input_tokens_seen": 118563325, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.41210938, + "step": 5512, + "time_per_iteration": 2.9401772022247314 + }, + { + "auxiliary_loss_clip": 0.01478487, + "auxiliary_loss_mlp": 0.01265558, + "balance_loss_clip": 1.12207019, + "balance_loss_mlp": 1.03076351, + "epoch": 0.6628990560933085, + "flos": 36176543474400.0, + "grad_norm": 2.28132724902147, + "language_loss": 0.75617933, + "learning_rate": 1.0780961837689773e-06, + "loss": 0.78361976, + "num_input_tokens_seen": 118582835, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.34375, + "step": 5513, + "time_per_iteration": 3.925950765609741 + }, + { + "auxiliary_loss_clip": 0.0148035, + "auxiliary_loss_mlp": 0.01260027, + "balance_loss_clip": 1.12238359, + "balance_loss_mlp": 1.02256203, + "epoch": 0.6630192989839476, + "flos": 18515460792960.0, + "grad_norm": 1.659712181016103, + "language_loss": 0.69983292, + "learning_rate": 1.0774049747071883e-06, + "loss": 0.72723669, + "num_input_tokens_seen": 118600715, + "router_z_loss_clip": 3.58007812, + "router_z_loss_mlp": 2.36914062, + "step": 5514, + "time_per_iteration": 3.05661940574646 + }, + { + "auxiliary_loss_clip": 0.01481716, + "auxiliary_loss_mlp": 0.01263931, + "balance_loss_clip": 1.12346792, + "balance_loss_mlp": 1.02627528, + "epoch": 0.6631395418745867, + "flos": 35812467423360.0, + "grad_norm": 2.8604886350065883, + "language_loss": 0.68390518, + "learning_rate": 1.076713905602332e-06, + "loss": 0.71136165, + "num_input_tokens_seen": 118621290, + "router_z_loss_clip": 3.58398438, + "router_z_loss_mlp": 2.37109375, + "step": 5515, + "time_per_iteration": 3.08505916595459 + }, + { + "auxiliary_loss_clip": 0.01485231, + "auxiliary_loss_mlp": 0.01268396, + "balance_loss_clip": 1.12962222, + "balance_loss_mlp": 1.02826095, + "epoch": 0.6632597847652257, + "flos": 20049821879040.0, + "grad_norm": 2.053168754631032, + "language_loss": 0.81186622, + "learning_rate": 1.07602297655924e-06, + "loss": 0.83940256, + "num_input_tokens_seen": 118639610, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.39648438, + "step": 5516, + "time_per_iteration": 3.1369497776031494 + }, + { + "auxiliary_loss_clip": 0.01485815, + "auxiliary_loss_mlp": 0.01264484, + "balance_loss_clip": 1.12795925, + "balance_loss_mlp": 1.02549362, + "epoch": 0.6633800276558649, + "flos": 21216579595200.0, + "grad_norm": 2.0726205420888393, + "language_loss": 0.81064439, + "learning_rate": 1.0753321876827292e-06, + "loss": 0.8381474, + "num_input_tokens_seen": 118658895, + "router_z_loss_clip": 3.58398438, + "router_z_loss_mlp": 2.38671875, + "step": 5517, + "time_per_iteration": 2.9966883659362793 + }, + { + "auxiliary_loss_clip": 0.01478069, + "auxiliary_loss_mlp": 0.01263126, + "balance_loss_clip": 1.11952782, + "balance_loss_mlp": 1.02470708, + "epoch": 0.663500270546504, + "flos": 23990028127200.0, + "grad_norm": 2.2408769252440814, + "language_loss": 0.73772281, + "learning_rate": 1.0746415390775893e-06, + "loss": 0.76513475, + "num_input_tokens_seen": 118677025, + "router_z_loss_clip": 3.58789062, + "router_z_loss_mlp": 2.37890625, + "step": 5518, + "time_per_iteration": 3.202188730239868 + }, + { + "auxiliary_loss_clip": 0.01481921, + "auxiliary_loss_mlp": 0.01268468, + "balance_loss_clip": 1.12502456, + "balance_loss_mlp": 1.02985835, + "epoch": 0.663620513437143, + "flos": 17934395552640.0, + "grad_norm": 1.9325297520582874, + "language_loss": 0.76533842, + "learning_rate": 1.0739510308485939e-06, + "loss": 0.79284233, + "num_input_tokens_seen": 118694240, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.38085938, + "step": 5519, + "time_per_iteration": 3.1696970462799072 + }, + { + "auxiliary_loss_clip": 0.01446824, + "auxiliary_loss_mlp": 0.01198593, + "balance_loss_clip": 1.10054183, + "balance_loss_mlp": 1.00366211, + "epoch": 0.6637407563277821, + "flos": 57846103456320.0, + "grad_norm": 0.8085667752887136, + "language_loss": 0.62546962, + "learning_rate": 1.07326066310049e-06, + "loss": 0.65192372, + "num_input_tokens_seen": 118758365, + "router_z_loss_clip": 3.4609375, + "router_z_loss_mlp": 1.94921875, + "step": 5520, + "time_per_iteration": 3.55617094039917 + }, + { + "auxiliary_loss_clip": 0.01483361, + "auxiliary_loss_mlp": 0.01276467, + "balance_loss_clip": 1.1253221, + "balance_loss_mlp": 1.03518724, + "epoch": 0.6638609992184212, + "flos": 27308358070560.0, + "grad_norm": 5.868986891158751, + "language_loss": 0.79750454, + "learning_rate": 1.0725704359380059e-06, + "loss": 0.82510281, + "num_input_tokens_seen": 118778220, + "router_z_loss_clip": 3.58007812, + "router_z_loss_mlp": 2.40820312, + "step": 5521, + "time_per_iteration": 3.058495283126831 + }, + { + "auxiliary_loss_clip": 0.01485712, + "auxiliary_loss_mlp": 0.01263726, + "balance_loss_clip": 1.12977815, + "balance_loss_mlp": 1.02492642, + "epoch": 0.6639812421090603, + "flos": 18626856891840.0, + "grad_norm": 1.9141749718385495, + "language_loss": 0.72405136, + "learning_rate": 1.0718803494658497e-06, + "loss": 0.75154579, + "num_input_tokens_seen": 118797110, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.3828125, + "step": 5522, + "time_per_iteration": 3.1195054054260254 + }, + { + "auxiliary_loss_clip": 0.01486449, + "auxiliary_loss_mlp": 0.01267768, + "balance_loss_clip": 1.13087296, + "balance_loss_mlp": 1.02820516, + "epoch": 0.6641014849996993, + "flos": 15926307012000.0, + "grad_norm": 2.2683664347000985, + "language_loss": 0.84445536, + "learning_rate": 1.071190403788707e-06, + "loss": 0.87199748, + "num_input_tokens_seen": 118812415, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.390625, + "step": 5523, + "time_per_iteration": 3.057751178741455 + }, + { + "auxiliary_loss_clip": 0.01487419, + "auxiliary_loss_mlp": 0.01275002, + "balance_loss_clip": 1.13177228, + "balance_loss_mlp": 1.03410387, + "epoch": 0.6642217278903385, + "flos": 26507914166880.0, + "grad_norm": 1.8495267880557171, + "language_loss": 0.76007438, + "learning_rate": 1.0705005990112415e-06, + "loss": 0.78769857, + "num_input_tokens_seen": 118832195, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.40429688, + "step": 5524, + "time_per_iteration": 3.0325069427490234 + }, + { + "auxiliary_loss_clip": 0.01491721, + "auxiliary_loss_mlp": 0.01275979, + "balance_loss_clip": 1.13458896, + "balance_loss_mlp": 1.03756034, + "epoch": 0.6643419707809776, + "flos": 15379111982880.0, + "grad_norm": 2.3690700822474984, + "language_loss": 0.74718046, + "learning_rate": 1.0698109352380957e-06, + "loss": 0.7748574, + "num_input_tokens_seen": 118849795, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 2.37890625, + "step": 5525, + "time_per_iteration": 2.9555604457855225 + }, + { + "auxiliary_loss_clip": 0.01485347, + "auxiliary_loss_mlp": 0.01262895, + "balance_loss_clip": 1.1285671, + "balance_loss_mlp": 1.02714658, + "epoch": 0.6644622136716166, + "flos": 25119881379360.0, + "grad_norm": 1.8927825932231208, + "language_loss": 0.77917612, + "learning_rate": 1.0691214125738909e-06, + "loss": 0.80665851, + "num_input_tokens_seen": 118870000, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.35351562, + "step": 5526, + "time_per_iteration": 3.1263797283172607 + }, + { + "auxiliary_loss_clip": 0.01449074, + "auxiliary_loss_mlp": 0.01200737, + "balance_loss_clip": 1.10262132, + "balance_loss_mlp": 1.00466156, + "epoch": 0.6645824565622558, + "flos": 66208397961600.0, + "grad_norm": 0.789599749375289, + "language_loss": 0.57443899, + "learning_rate": 1.0684320311232287e-06, + "loss": 0.60093707, + "num_input_tokens_seen": 118932905, + "router_z_loss_clip": 3.46875, + "router_z_loss_mlp": 1.95703125, + "step": 5527, + "time_per_iteration": 3.5790915489196777 + }, + { + "auxiliary_loss_clip": 0.01486519, + "auxiliary_loss_mlp": 0.01266347, + "balance_loss_clip": 1.13061523, + "balance_loss_mlp": 1.02621198, + "epoch": 0.6647026994528948, + "flos": 25084456113600.0, + "grad_norm": 4.873418144330801, + "language_loss": 0.8136524, + "learning_rate": 1.0677427909906865e-06, + "loss": 0.84118104, + "num_input_tokens_seen": 118953355, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.39648438, + "step": 5528, + "time_per_iteration": 3.9343783855438232 + }, + { + "auxiliary_loss_clip": 0.01487003, + "auxiliary_loss_mlp": 0.01269208, + "balance_loss_clip": 1.13112485, + "balance_loss_mlp": 1.02869153, + "epoch": 0.6648229423435339, + "flos": 18224093753280.0, + "grad_norm": 2.609497419410231, + "language_loss": 0.72290933, + "learning_rate": 1.0670536922808216e-06, + "loss": 0.75047147, + "num_input_tokens_seen": 118973480, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.40039062, + "step": 5529, + "time_per_iteration": 3.024156332015991 + }, + { + "auxiliary_loss_clip": 0.01487388, + "auxiliary_loss_mlp": 0.01270371, + "balance_loss_clip": 1.13050866, + "balance_loss_mlp": 1.0298543, + "epoch": 0.6649431852341731, + "flos": 18298888813440.0, + "grad_norm": 2.8522186832079077, + "language_loss": 0.72299415, + "learning_rate": 1.06636473509817e-06, + "loss": 0.75057173, + "num_input_tokens_seen": 118989860, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.40039062, + "step": 5530, + "time_per_iteration": 3.908731698989868 + }, + { + "auxiliary_loss_clip": 0.01493933, + "auxiliary_loss_mlp": 0.01271104, + "balance_loss_clip": 1.13833594, + "balance_loss_mlp": 1.03268504, + "epoch": 0.6650634281248121, + "flos": 17021493561600.0, + "grad_norm": 3.4957345744575314, + "language_loss": 0.80662191, + "learning_rate": 1.0656759195472447e-06, + "loss": 0.83427227, + "num_input_tokens_seen": 119007150, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.37890625, + "step": 5531, + "time_per_iteration": 2.987442970275879 + }, + { + "auxiliary_loss_clip": 0.01449331, + "auxiliary_loss_mlp": 0.0121167, + "balance_loss_clip": 1.10436177, + "balance_loss_mlp": 1.0140686, + "epoch": 0.6651836710154512, + "flos": 69301204879680.0, + "grad_norm": 0.7787545328402065, + "language_loss": 0.59720534, + "learning_rate": 1.0649872457325414e-06, + "loss": 0.62381536, + "num_input_tokens_seen": 119068435, + "router_z_loss_clip": 3.4453125, + "router_z_loss_mlp": 1.96875, + "step": 5532, + "time_per_iteration": 3.4049768447875977 + }, + { + "auxiliary_loss_clip": 0.01448539, + "auxiliary_loss_mlp": 0.01212997, + "balance_loss_clip": 1.10352075, + "balance_loss_mlp": 1.01768494, + "epoch": 0.6653039139060903, + "flos": 66889518780960.0, + "grad_norm": 0.8980901392041146, + "language_loss": 0.54997468, + "learning_rate": 1.0642987137585278e-06, + "loss": 0.57659006, + "num_input_tokens_seen": 119127960, + "router_z_loss_clip": 3.4453125, + "router_z_loss_mlp": 1.94921875, + "step": 5533, + "time_per_iteration": 4.203483819961548 + }, + { + "auxiliary_loss_clip": 0.01490612, + "auxiliary_loss_mlp": 0.01261824, + "balance_loss_clip": 1.13557172, + "balance_loss_mlp": 1.02226138, + "epoch": 0.6654241567967294, + "flos": 21472180015680.0, + "grad_norm": 1.9798625148915339, + "language_loss": 0.82548505, + "learning_rate": 1.0636103237296561e-06, + "loss": 0.85300946, + "num_input_tokens_seen": 119146885, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.390625, + "step": 5534, + "time_per_iteration": 2.9835894107818604 + }, + { + "auxiliary_loss_clip": 0.01494556, + "auxiliary_loss_mlp": 0.01273301, + "balance_loss_clip": 1.13948393, + "balance_loss_mlp": 1.0365994, + "epoch": 0.6655443996873684, + "flos": 25121891571840.0, + "grad_norm": 1.909731648598212, + "language_loss": 0.84271491, + "learning_rate": 1.062922075750353e-06, + "loss": 0.87039346, + "num_input_tokens_seen": 119166900, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.36132812, + "step": 5535, + "time_per_iteration": 2.9787261486053467 + }, + { + "auxiliary_loss_clip": 0.01495228, + "auxiliary_loss_mlp": 0.01262244, + "balance_loss_clip": 1.14007974, + "balance_loss_mlp": 1.02668607, + "epoch": 0.6656646425780076, + "flos": 17459264756160.0, + "grad_norm": 2.88393195994594, + "language_loss": 0.72128952, + "learning_rate": 1.0622339699250267e-06, + "loss": 0.74886417, + "num_input_tokens_seen": 119184820, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.3515625, + "step": 5536, + "time_per_iteration": 2.9870965480804443 + }, + { + "auxiliary_loss_clip": 0.01486496, + "auxiliary_loss_mlp": 0.01269368, + "balance_loss_clip": 1.13147104, + "balance_loss_mlp": 1.03113985, + "epoch": 0.6657848854686467, + "flos": 23436081885600.0, + "grad_norm": 1.9116093742225149, + "language_loss": 0.79317659, + "learning_rate": 1.0615460063580624e-06, + "loss": 0.82073522, + "num_input_tokens_seen": 119203295, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.37695312, + "step": 5537, + "time_per_iteration": 3.063063144683838 + }, + { + "auxiliary_loss_clip": 0.01486127, + "auxiliary_loss_mlp": 0.01267663, + "balance_loss_clip": 1.13055444, + "balance_loss_mlp": 1.03038907, + "epoch": 0.6659051283592857, + "flos": 11511500961600.0, + "grad_norm": 2.407096649415769, + "language_loss": 0.73326266, + "learning_rate": 1.060858185153821e-06, + "loss": 0.76080054, + "num_input_tokens_seen": 119221395, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.3671875, + "step": 5538, + "time_per_iteration": 2.970153331756592 + }, + { + "auxiliary_loss_clip": 0.01493043, + "auxiliary_loss_mlp": 0.01271812, + "balance_loss_clip": 1.13714981, + "balance_loss_mlp": 1.03091359, + "epoch": 0.6660253712499249, + "flos": 20596637626560.0, + "grad_norm": 3.6732851213450517, + "language_loss": 0.76648247, + "learning_rate": 1.0601705064166474e-06, + "loss": 0.79413104, + "num_input_tokens_seen": 119239790, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.40429688, + "step": 5539, + "time_per_iteration": 2.9888319969177246 + }, + { + "auxiliary_loss_clip": 0.01485023, + "auxiliary_loss_mlp": 0.01269624, + "balance_loss_clip": 1.1302495, + "balance_loss_mlp": 1.03101432, + "epoch": 0.666145614140564, + "flos": 21253635771840.0, + "grad_norm": 2.2764711630285457, + "language_loss": 0.73889309, + "learning_rate": 1.0594829702508596e-06, + "loss": 0.76643956, + "num_input_tokens_seen": 119257505, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 2.38085938, + "step": 5540, + "time_per_iteration": 3.9024813175201416 + }, + { + "auxiliary_loss_clip": 0.0148104, + "auxiliary_loss_mlp": 0.01268348, + "balance_loss_clip": 1.12430406, + "balance_loss_mlp": 1.03183675, + "epoch": 0.666265857031203, + "flos": 33728825259360.0, + "grad_norm": 2.4942106194490115, + "language_loss": 0.55192238, + "learning_rate": 1.0587955767607592e-06, + "loss": 0.57941628, + "num_input_tokens_seen": 119279365, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.359375, + "step": 5541, + "time_per_iteration": 3.148956537246704 + }, + { + "auxiliary_loss_clip": 0.01483648, + "auxiliary_loss_mlp": 0.01264989, + "balance_loss_clip": 1.12660146, + "balance_loss_mlp": 1.02695239, + "epoch": 0.6663860999218422, + "flos": 17458544121120.0, + "grad_norm": 2.1365713896110727, + "language_loss": 0.77318716, + "learning_rate": 1.0581083260506206e-06, + "loss": 0.80067354, + "num_input_tokens_seen": 119296150, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.375, + "step": 5542, + "time_per_iteration": 3.201096534729004 + }, + { + "auxiliary_loss_clip": 0.01484718, + "auxiliary_loss_mlp": 0.01277636, + "balance_loss_clip": 1.12849045, + "balance_loss_mlp": 1.03921747, + "epoch": 0.6665063428124812, + "flos": 17678719275840.0, + "grad_norm": 2.920211116250805, + "language_loss": 0.76579601, + "learning_rate": 1.0574212182246993e-06, + "loss": 0.79341954, + "num_input_tokens_seen": 119314845, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.37890625, + "step": 5543, + "time_per_iteration": 2.962296485900879 + }, + { + "auxiliary_loss_clip": 0.01486072, + "auxiliary_loss_mlp": 0.0127697, + "balance_loss_clip": 1.1294775, + "balance_loss_mlp": 1.0374074, + "epoch": 0.6666265857031203, + "flos": 27675771799680.0, + "grad_norm": 2.7158247466505987, + "language_loss": 0.76001149, + "learning_rate": 1.0567342533872303e-06, + "loss": 0.78764188, + "num_input_tokens_seen": 119334875, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.390625, + "step": 5544, + "time_per_iteration": 3.0157737731933594 + }, + { + "auxiliary_loss_clip": 0.01487927, + "auxiliary_loss_mlp": 0.0126431, + "balance_loss_clip": 1.13332808, + "balance_loss_mlp": 1.02589154, + "epoch": 0.6667468285937594, + "flos": 25049220488640.0, + "grad_norm": 1.8063714879655755, + "language_loss": 0.81079626, + "learning_rate": 1.0560474316424255e-06, + "loss": 0.83831871, + "num_input_tokens_seen": 119354635, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.37890625, + "step": 5545, + "time_per_iteration": 3.0593135356903076 + }, + { + "auxiliary_loss_clip": 0.01487557, + "auxiliary_loss_mlp": 0.01275041, + "balance_loss_clip": 1.13066006, + "balance_loss_mlp": 1.03509688, + "epoch": 0.6668670714843985, + "flos": 22782497274720.0, + "grad_norm": 3.2078549729455523, + "language_loss": 0.73907238, + "learning_rate": 1.0553607530944746e-06, + "loss": 0.76669836, + "num_input_tokens_seen": 119372690, + "router_z_loss_clip": 3.57617188, + "router_z_loss_mlp": 2.39453125, + "step": 5546, + "time_per_iteration": 3.021055221557617 + }, + { + "auxiliary_loss_clip": 0.01487137, + "auxiliary_loss_mlp": 0.01272796, + "balance_loss_clip": 1.12994146, + "balance_loss_mlp": 1.03666651, + "epoch": 0.6669873143750376, + "flos": 22166082624960.0, + "grad_norm": 2.1073743948478656, + "language_loss": 0.89354229, + "learning_rate": 1.0546742178475463e-06, + "loss": 0.92114162, + "num_input_tokens_seen": 119391685, + "router_z_loss_clip": 3.57617188, + "router_z_loss_mlp": 2.35546875, + "step": 5547, + "time_per_iteration": 3.0319736003875732 + }, + { + "auxiliary_loss_clip": 0.0148849, + "auxiliary_loss_mlp": 0.01257239, + "balance_loss_clip": 1.13301635, + "balance_loss_mlp": 1.02397037, + "epoch": 0.6671075572656767, + "flos": 20516532624000.0, + "grad_norm": 2.2125604122823406, + "language_loss": 0.86700422, + "learning_rate": 1.0539878260057868e-06, + "loss": 0.89446151, + "num_input_tokens_seen": 119410725, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.328125, + "step": 5548, + "time_per_iteration": 2.971181631088257 + }, + { + "auxiliary_loss_clip": 0.01488463, + "auxiliary_loss_mlp": 0.01274552, + "balance_loss_clip": 1.13237309, + "balance_loss_mlp": 1.03651476, + "epoch": 0.6672278001563158, + "flos": 17933143923360.0, + "grad_norm": 3.018658333056875, + "language_loss": 0.6889205, + "learning_rate": 1.0533015776733226e-06, + "loss": 0.71655065, + "num_input_tokens_seen": 119426875, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.375, + "step": 5549, + "time_per_iteration": 2.9412617683410645 + }, + { + "auxiliary_loss_clip": 0.01485937, + "auxiliary_loss_mlp": 0.01269499, + "balance_loss_clip": 1.12882864, + "balance_loss_mlp": 1.03298759, + "epoch": 0.6673480430469548, + "flos": 22344119229600.0, + "grad_norm": 2.765141860600108, + "language_loss": 0.78456187, + "learning_rate": 1.0526154729542566e-06, + "loss": 0.81211627, + "num_input_tokens_seen": 119446935, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.36132812, + "step": 5550, + "time_per_iteration": 2.984703779220581 + }, + { + "auxiliary_loss_clip": 0.01495444, + "auxiliary_loss_mlp": 0.0126208, + "balance_loss_clip": 1.13880372, + "balance_loss_mlp": 1.02480543, + "epoch": 0.6674682859375939, + "flos": 20705909748480.0, + "grad_norm": 3.2182523384660473, + "language_loss": 0.80100799, + "learning_rate": 1.0519295119526699e-06, + "loss": 0.82858318, + "num_input_tokens_seen": 119463240, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.3671875, + "step": 5551, + "time_per_iteration": 3.0739927291870117 + }, + { + "auxiliary_loss_clip": 0.01490984, + "auxiliary_loss_mlp": 0.01259005, + "balance_loss_clip": 1.13565373, + "balance_loss_mlp": 1.02153969, + "epoch": 0.667588528828233, + "flos": 26208695998080.0, + "grad_norm": 1.695984244697064, + "language_loss": 0.83216321, + "learning_rate": 1.0512436947726227e-06, + "loss": 0.85966313, + "num_input_tokens_seen": 119484655, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 2.36914062, + "step": 5552, + "time_per_iteration": 3.061248302459717 + }, + { + "auxiliary_loss_clip": 0.01488176, + "auxiliary_loss_mlp": 0.01269354, + "balance_loss_clip": 1.13289118, + "balance_loss_mlp": 1.02807426, + "epoch": 0.6677087717188721, + "flos": 23072991966720.0, + "grad_norm": 2.524214506322532, + "language_loss": 0.65402269, + "learning_rate": 1.0505580215181517e-06, + "loss": 0.68159795, + "num_input_tokens_seen": 119502895, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.40820312, + "step": 5553, + "time_per_iteration": 3.0863592624664307 + }, + { + "auxiliary_loss_clip": 0.01444334, + "auxiliary_loss_mlp": 0.01215652, + "balance_loss_clip": 1.10118771, + "balance_loss_mlp": 1.01881409, + "epoch": 0.6678290146095112, + "flos": 70948061981280.0, + "grad_norm": 0.7899500301174408, + "language_loss": 0.56599164, + "learning_rate": 1.0498724922932753e-06, + "loss": 0.59259152, + "num_input_tokens_seen": 119561010, + "router_z_loss_clip": 3.4375, + "router_z_loss_mlp": 1.96484375, + "step": 5554, + "time_per_iteration": 3.4131338596343994 + }, + { + "auxiliary_loss_clip": 0.01495869, + "auxiliary_loss_mlp": 0.01268901, + "balance_loss_clip": 1.13947606, + "balance_loss_mlp": 1.0283848, + "epoch": 0.6679492575001503, + "flos": 18663078648960.0, + "grad_norm": 2.2554860784470634, + "language_loss": 0.86626589, + "learning_rate": 1.0491871072019851e-06, + "loss": 0.89391363, + "num_input_tokens_seen": 119578900, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.40039062, + "step": 5555, + "time_per_iteration": 3.8278963565826416 + }, + { + "auxiliary_loss_clip": 0.01484178, + "auxiliary_loss_mlp": 0.01260651, + "balance_loss_clip": 1.12918282, + "balance_loss_mlp": 1.02299547, + "epoch": 0.6680695003907894, + "flos": 29714354945280.0, + "grad_norm": 1.7427213779989843, + "language_loss": 0.64034939, + "learning_rate": 1.0485018663482555e-06, + "loss": 0.66779768, + "num_input_tokens_seen": 119598920, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.37109375, + "step": 5556, + "time_per_iteration": 3.1409215927124023 + }, + { + "auxiliary_loss_clip": 0.01491665, + "auxiliary_loss_mlp": 0.01265347, + "balance_loss_clip": 1.13578761, + "balance_loss_mlp": 1.02807307, + "epoch": 0.6681897432814284, + "flos": 28221411774240.0, + "grad_norm": 2.61736267700078, + "language_loss": 0.7141149, + "learning_rate": 1.0478167698360354e-06, + "loss": 0.74168503, + "num_input_tokens_seen": 119618220, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.3671875, + "step": 5557, + "time_per_iteration": 3.0944721698760986 + }, + { + "auxiliary_loss_clip": 0.01482622, + "auxiliary_loss_mlp": 0.01274922, + "balance_loss_clip": 1.12521887, + "balance_loss_mlp": 1.03802907, + "epoch": 0.6683099861720676, + "flos": 25048917063360.0, + "grad_norm": 2.732650209845308, + "language_loss": 0.70559013, + "learning_rate": 1.0471318177692556e-06, + "loss": 0.73316562, + "num_input_tokens_seen": 119638520, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.36328125, + "step": 5558, + "time_per_iteration": 3.9048986434936523 + }, + { + "auxiliary_loss_clip": 0.01484998, + "auxiliary_loss_mlp": 0.01263514, + "balance_loss_clip": 1.12949491, + "balance_loss_mlp": 1.02433252, + "epoch": 0.6684302290627067, + "flos": 22998879613440.0, + "grad_norm": 2.311150095302681, + "language_loss": 0.76401424, + "learning_rate": 1.046447010251821e-06, + "loss": 0.79149932, + "num_input_tokens_seen": 119655850, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.38671875, + "step": 5559, + "time_per_iteration": 3.0337307453155518 + }, + { + "auxiliary_loss_clip": 0.01490862, + "auxiliary_loss_mlp": 0.01266048, + "balance_loss_clip": 1.1347878, + "balance_loss_mlp": 1.0295372, + "epoch": 0.6685504719533457, + "flos": 26575995942720.0, + "grad_norm": 1.6813676742318497, + "language_loss": 0.75731373, + "learning_rate": 1.0457623473876157e-06, + "loss": 0.78488284, + "num_input_tokens_seen": 119675355, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.359375, + "step": 5560, + "time_per_iteration": 3.006066083908081 + }, + { + "auxiliary_loss_clip": 0.01488716, + "auxiliary_loss_mlp": 0.01263615, + "balance_loss_clip": 1.13362598, + "balance_loss_mlp": 1.02691269, + "epoch": 0.6686707148439849, + "flos": 28988630245440.0, + "grad_norm": 2.5443757886528306, + "language_loss": 0.71125412, + "learning_rate": 1.0450778292805046e-06, + "loss": 0.73877746, + "num_input_tokens_seen": 119695340, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.36132812, + "step": 5561, + "time_per_iteration": 3.925631046295166 + }, + { + "auxiliary_loss_clip": 0.01490453, + "auxiliary_loss_mlp": 0.01277012, + "balance_loss_clip": 1.13386393, + "balance_loss_mlp": 1.03802073, + "epoch": 0.6687909577346239, + "flos": 23625421081920.0, + "grad_norm": 3.7857930488361773, + "language_loss": 0.78608787, + "learning_rate": 1.0443934560343267e-06, + "loss": 0.81376243, + "num_input_tokens_seen": 119716750, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.38476562, + "step": 5562, + "time_per_iteration": 3.0078020095825195 + }, + { + "auxiliary_loss_clip": 0.01489607, + "auxiliary_loss_mlp": 0.01263704, + "balance_loss_clip": 1.13255167, + "balance_loss_mlp": 1.02814674, + "epoch": 0.668911200625263, + "flos": 23150821279680.0, + "grad_norm": 1.9599505150208045, + "language_loss": 0.78420639, + "learning_rate": 1.0437092277529034e-06, + "loss": 0.8117395, + "num_input_tokens_seen": 119736005, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 2.34960938, + "step": 5563, + "time_per_iteration": 2.928755521774292 + }, + { + "auxiliary_loss_clip": 0.01491763, + "auxiliary_loss_mlp": 0.01262031, + "balance_loss_clip": 1.13490343, + "balance_loss_mlp": 1.02551961, + "epoch": 0.6690314435159022, + "flos": 18553920311520.0, + "grad_norm": 2.005296305932965, + "language_loss": 0.73394173, + "learning_rate": 1.0430251445400292e-06, + "loss": 0.76147974, + "num_input_tokens_seen": 119754050, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.359375, + "step": 5564, + "time_per_iteration": 3.015406847000122 + }, + { + "auxiliary_loss_clip": 0.01499984, + "auxiliary_loss_mlp": 0.01268016, + "balance_loss_clip": 1.14610767, + "balance_loss_mlp": 1.02749944, + "epoch": 0.6691516864065412, + "flos": 31762268418240.0, + "grad_norm": 4.642143746691244, + "language_loss": 0.62813228, + "learning_rate": 1.0423412064994787e-06, + "loss": 0.65581226, + "num_input_tokens_seen": 119774820, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 2.40039062, + "step": 5565, + "time_per_iteration": 3.068159580230713 + }, + { + "auxiliary_loss_clip": 0.01489112, + "auxiliary_loss_mlp": 0.01264936, + "balance_loss_clip": 1.13385212, + "balance_loss_mlp": 1.03052294, + "epoch": 0.6692719292971803, + "flos": 34936773321600.0, + "grad_norm": 2.558526152367802, + "language_loss": 0.74117726, + "learning_rate": 1.0416574137350064e-06, + "loss": 0.76871777, + "num_input_tokens_seen": 119795525, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.34179688, + "step": 5566, + "time_per_iteration": 3.0824272632598877 + }, + { + "auxiliary_loss_clip": 0.01496372, + "auxiliary_loss_mlp": 0.01261453, + "balance_loss_clip": 1.14027047, + "balance_loss_mlp": 1.02303469, + "epoch": 0.6693921721878194, + "flos": 20451295460160.0, + "grad_norm": 3.9166666928458453, + "language_loss": 0.80802846, + "learning_rate": 1.0409737663503428e-06, + "loss": 0.83560669, + "num_input_tokens_seen": 119813905, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.37890625, + "step": 5567, + "time_per_iteration": 3.8143417835235596 + }, + { + "auxiliary_loss_clip": 0.01480267, + "auxiliary_loss_mlp": 0.01278553, + "balance_loss_clip": 1.12420344, + "balance_loss_mlp": 1.03784561, + "epoch": 0.6695124150784585, + "flos": 16616227164480.0, + "grad_norm": 2.0102686263261105, + "language_loss": 0.82946414, + "learning_rate": 1.040290264449196e-06, + "loss": 0.85705233, + "num_input_tokens_seen": 119832010, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.40234375, + "step": 5568, + "time_per_iteration": 2.925337553024292 + }, + { + "auxiliary_loss_clip": 0.01488803, + "auxiliary_loss_mlp": 0.01257188, + "balance_loss_clip": 1.13253319, + "balance_loss_mlp": 1.02144015, + "epoch": 0.6696326579690975, + "flos": 26654621747040.0, + "grad_norm": 3.1147932534442835, + "language_loss": 0.63851595, + "learning_rate": 1.0396069081352532e-06, + "loss": 0.66597581, + "num_input_tokens_seen": 119851165, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.35351562, + "step": 5569, + "time_per_iteration": 3.078843355178833 + }, + { + "auxiliary_loss_clip": 0.01447026, + "auxiliary_loss_mlp": 0.012006, + "balance_loss_clip": 1.10450828, + "balance_loss_mlp": 1.00376129, + "epoch": 0.6697529008597367, + "flos": 66971140909920.0, + "grad_norm": 0.7721512019122118, + "language_loss": 0.56052071, + "learning_rate": 1.0389236975121782e-06, + "loss": 0.58699697, + "num_input_tokens_seen": 119906015, + "router_z_loss_clip": 3.421875, + "router_z_loss_mlp": 1.96484375, + "step": 5570, + "time_per_iteration": 3.380265235900879 + }, + { + "auxiliary_loss_clip": 0.01485587, + "auxiliary_loss_mlp": 0.0126686, + "balance_loss_clip": 1.13043475, + "balance_loss_mlp": 1.0290134, + "epoch": 0.6698731437503758, + "flos": 20889066654720.0, + "grad_norm": 2.4394915295158937, + "language_loss": 0.71656215, + "learning_rate": 1.0382406326836147e-06, + "loss": 0.74408662, + "num_input_tokens_seen": 119925160, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 2.37304688, + "step": 5571, + "time_per_iteration": 3.007997751235962 + }, + { + "auxiliary_loss_clip": 0.01487369, + "auxiliary_loss_mlp": 0.01272445, + "balance_loss_clip": 1.13080466, + "balance_loss_mlp": 1.03192782, + "epoch": 0.6699933866410148, + "flos": 20411394671520.0, + "grad_norm": 1.9816562883001356, + "language_loss": 0.75923908, + "learning_rate": 1.0375577137531828e-06, + "loss": 0.78683722, + "num_input_tokens_seen": 119943720, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.40234375, + "step": 5572, + "time_per_iteration": 3.055074691772461 + }, + { + "auxiliary_loss_clip": 0.01485021, + "auxiliary_loss_mlp": 0.01271684, + "balance_loss_clip": 1.12884116, + "balance_loss_mlp": 1.03250241, + "epoch": 0.670113629531654, + "flos": 29025648493920.0, + "grad_norm": 2.3423259309066746, + "language_loss": 0.71966332, + "learning_rate": 1.0368749408244802e-06, + "loss": 0.74723035, + "num_input_tokens_seen": 119966640, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.38671875, + "step": 5573, + "time_per_iteration": 3.0603418350219727 + }, + { + "auxiliary_loss_clip": 0.01492733, + "auxiliary_loss_mlp": 0.01279723, + "balance_loss_clip": 1.13746262, + "balance_loss_mlp": 1.04187632, + "epoch": 0.670233872422293, + "flos": 19793728392480.0, + "grad_norm": 2.0021561281002325, + "language_loss": 0.79069763, + "learning_rate": 1.0361923140010836e-06, + "loss": 0.8184222, + "num_input_tokens_seen": 119985125, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 2.37304688, + "step": 5574, + "time_per_iteration": 3.0251059532165527 + }, + { + "auxiliary_loss_clip": 0.01488031, + "auxiliary_loss_mlp": 0.01264977, + "balance_loss_clip": 1.13312137, + "balance_loss_mlp": 1.02827454, + "epoch": 0.6703541153129321, + "flos": 24246007829280.0, + "grad_norm": 2.2397215328924744, + "language_loss": 0.63311863, + "learning_rate": 1.0355098333865455e-06, + "loss": 0.6606487, + "num_input_tokens_seen": 120004355, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.36132812, + "step": 5575, + "time_per_iteration": 3.044689178466797 + }, + { + "auxiliary_loss_clip": 0.01495074, + "auxiliary_loss_mlp": 0.01261375, + "balance_loss_clip": 1.14080846, + "balance_loss_mlp": 1.02715302, + "epoch": 0.6704743582035713, + "flos": 26690995216800.0, + "grad_norm": 1.6867814797782796, + "language_loss": 0.69193637, + "learning_rate": 1.0348274990844006e-06, + "loss": 0.71950084, + "num_input_tokens_seen": 120027115, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 2.33984375, + "step": 5576, + "time_per_iteration": 3.12445068359375 + }, + { + "auxiliary_loss_clip": 0.01482495, + "auxiliary_loss_mlp": 0.0126777, + "balance_loss_clip": 1.12688231, + "balance_loss_mlp": 1.02992415, + "epoch": 0.6705946010942103, + "flos": 23516679954240.0, + "grad_norm": 1.8165497056130726, + "language_loss": 0.72511101, + "learning_rate": 1.034145311198155e-06, + "loss": 0.75261366, + "num_input_tokens_seen": 120047130, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.37304688, + "step": 5577, + "time_per_iteration": 3.1302921772003174 + }, + { + "auxiliary_loss_clip": 0.01482086, + "auxiliary_loss_mlp": 0.01265399, + "balance_loss_clip": 1.1275965, + "balance_loss_mlp": 1.03098583, + "epoch": 0.6707148439848494, + "flos": 24063306060960.0, + "grad_norm": 2.1021256569999553, + "language_loss": 0.63995153, + "learning_rate": 1.0334632698312989e-06, + "loss": 0.66742641, + "num_input_tokens_seen": 120067925, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.33984375, + "step": 5578, + "time_per_iteration": 3.1276276111602783 + }, + { + "auxiliary_loss_clip": 0.01489941, + "auxiliary_loss_mlp": 0.01265046, + "balance_loss_clip": 1.13580132, + "balance_loss_mlp": 1.02910733, + "epoch": 0.6708350868754885, + "flos": 22530879311040.0, + "grad_norm": 1.9650901846521762, + "language_loss": 0.75216621, + "learning_rate": 1.032781375087295e-06, + "loss": 0.77971613, + "num_input_tokens_seen": 120087825, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.35351562, + "step": 5579, + "time_per_iteration": 2.9992921352386475 + }, + { + "auxiliary_loss_clip": 0.01491529, + "auxiliary_loss_mlp": 0.01261534, + "balance_loss_clip": 1.13541484, + "balance_loss_mlp": 1.02406883, + "epoch": 0.6709553297661276, + "flos": 25230063777120.0, + "grad_norm": 1.4544836006443107, + "language_loss": 0.67624944, + "learning_rate": 1.0320996270695891e-06, + "loss": 0.70378006, + "num_input_tokens_seen": 120108895, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.36914062, + "step": 5580, + "time_per_iteration": 3.102916955947876 + }, + { + "auxiliary_loss_clip": 0.01485567, + "auxiliary_loss_mlp": 0.0126347, + "balance_loss_clip": 1.12983561, + "balance_loss_mlp": 1.02676845, + "epoch": 0.6710755726567667, + "flos": 20450840322240.0, + "grad_norm": 1.7514782695333682, + "language_loss": 0.73214793, + "learning_rate": 1.0314180258815998e-06, + "loss": 0.75963831, + "num_input_tokens_seen": 120127535, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.36132812, + "step": 5581, + "time_per_iteration": 2.9231467247009277 + }, + { + "auxiliary_loss_clip": 0.01484935, + "auxiliary_loss_mlp": 0.01261908, + "balance_loss_clip": 1.1282593, + "balance_loss_mlp": 1.02711296, + "epoch": 0.6711958155474057, + "flos": 25997851170720.0, + "grad_norm": 1.6175770477144775, + "language_loss": 0.74167633, + "learning_rate": 1.0307365716267247e-06, + "loss": 0.76914477, + "num_input_tokens_seen": 120147980, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.34179688, + "step": 5582, + "time_per_iteration": 3.8308606147766113 + }, + { + "auxiliary_loss_clip": 0.01484773, + "auxiliary_loss_mlp": 0.0127367, + "balance_loss_clip": 1.12962306, + "balance_loss_mlp": 1.03506112, + "epoch": 0.6713160584380449, + "flos": 19939753265760.0, + "grad_norm": 1.9734913636262477, + "language_loss": 0.78379494, + "learning_rate": 1.0300552644083423e-06, + "loss": 0.81137943, + "num_input_tokens_seen": 120166905, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.38085938, + "step": 5583, + "time_per_iteration": 3.108734369277954 + }, + { + "auxiliary_loss_clip": 0.01483769, + "auxiliary_loss_mlp": 0.01265505, + "balance_loss_clip": 1.12659812, + "balance_loss_mlp": 1.02537, + "epoch": 0.6714363013286839, + "flos": 18225345382560.0, + "grad_norm": 3.99799626219316, + "language_loss": 0.72250712, + "learning_rate": 1.0293741043298036e-06, + "loss": 0.74999982, + "num_input_tokens_seen": 120185255, + "router_z_loss_clip": 3.57617188, + "router_z_loss_mlp": 2.39648438, + "step": 5584, + "time_per_iteration": 2.9306631088256836 + }, + { + "auxiliary_loss_clip": 0.01493049, + "auxiliary_loss_mlp": 0.01270908, + "balance_loss_clip": 1.13746762, + "balance_loss_mlp": 1.03058243, + "epoch": 0.671556544219323, + "flos": 25814770120800.0, + "grad_norm": 2.281849586445527, + "language_loss": 0.71411705, + "learning_rate": 1.0286930914944436e-06, + "loss": 0.74175662, + "num_input_tokens_seen": 120205070, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.3984375, + "step": 5585, + "time_per_iteration": 3.818432092666626 + }, + { + "auxiliary_loss_clip": 0.01472719, + "auxiliary_loss_mlp": 0.01262899, + "balance_loss_clip": 1.11581743, + "balance_loss_mlp": 1.02524304, + "epoch": 0.6716767871099621, + "flos": 15852194658720.0, + "grad_norm": 2.3566688087698457, + "language_loss": 0.77033198, + "learning_rate": 1.0280122260055684e-06, + "loss": 0.79768813, + "num_input_tokens_seen": 120220780, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.37109375, + "step": 5586, + "time_per_iteration": 3.0098111629486084 + }, + { + "auxiliary_loss_clip": 0.01484084, + "auxiliary_loss_mlp": 0.01258846, + "balance_loss_clip": 1.12910128, + "balance_loss_mlp": 1.02042794, + "epoch": 0.6717970300006012, + "flos": 19758189342240.0, + "grad_norm": 1.9936434875376345, + "language_loss": 0.82127553, + "learning_rate": 1.0273315079664652e-06, + "loss": 0.84870481, + "num_input_tokens_seen": 120238735, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.37890625, + "step": 5587, + "time_per_iteration": 3.0839879512786865 + }, + { + "auxiliary_loss_clip": 0.01489859, + "auxiliary_loss_mlp": 0.01272421, + "balance_loss_clip": 1.13507581, + "balance_loss_mlp": 1.03667307, + "epoch": 0.6719172728912403, + "flos": 25487484749280.0, + "grad_norm": 7.239885710746083, + "language_loss": 0.74120551, + "learning_rate": 1.0266509374803992e-06, + "loss": 0.76882839, + "num_input_tokens_seen": 120259895, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 2.3515625, + "step": 5588, + "time_per_iteration": 3.893664836883545 + }, + { + "auxiliary_loss_clip": 0.01486813, + "auxiliary_loss_mlp": 0.01275396, + "balance_loss_clip": 1.13077092, + "balance_loss_mlp": 1.03850341, + "epoch": 0.6720375157818794, + "flos": 15881741059680.0, + "grad_norm": 2.4138642205866674, + "language_loss": 0.843642, + "learning_rate": 1.0259705146506123e-06, + "loss": 0.8712641, + "num_input_tokens_seen": 120274790, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.36328125, + "step": 5589, + "time_per_iteration": 2.96150541305542 + }, + { + "auxiliary_loss_clip": 0.01486302, + "auxiliary_loss_mlp": 0.01257082, + "balance_loss_clip": 1.13163543, + "balance_loss_mlp": 1.0209527, + "epoch": 0.6721577586725185, + "flos": 32013355387680.0, + "grad_norm": 4.099113806727947, + "language_loss": 0.77378005, + "learning_rate": 1.025290239580324e-06, + "loss": 0.80121386, + "num_input_tokens_seen": 120295460, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.35546875, + "step": 5590, + "time_per_iteration": 3.0774359703063965 + }, + { + "auxiliary_loss_clip": 0.01483299, + "auxiliary_loss_mlp": 0.01275702, + "balance_loss_clip": 1.1269567, + "balance_loss_mlp": 1.03823698, + "epoch": 0.6722780015631575, + "flos": 20739817887840.0, + "grad_norm": 1.8824075941432188, + "language_loss": 0.7585637, + "learning_rate": 1.0246101123727313e-06, + "loss": 0.78615367, + "num_input_tokens_seen": 120314440, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.37109375, + "step": 5591, + "time_per_iteration": 3.0177953243255615 + }, + { + "auxiliary_loss_clip": 0.01483348, + "auxiliary_loss_mlp": 0.01270003, + "balance_loss_clip": 1.12837958, + "balance_loss_mlp": 1.03501773, + "epoch": 0.6723982444537967, + "flos": 16911500804640.0, + "grad_norm": 2.450842220525366, + "language_loss": 0.78842026, + "learning_rate": 1.0239301331310085e-06, + "loss": 0.81595373, + "num_input_tokens_seen": 120332060, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.34375, + "step": 5592, + "time_per_iteration": 3.041809558868408 + }, + { + "auxiliary_loss_clip": 0.01482696, + "auxiliary_loss_mlp": 0.01259428, + "balance_loss_clip": 1.12819457, + "balance_loss_mlp": 1.02596855, + "epoch": 0.6725184873444358, + "flos": 20669915560320.0, + "grad_norm": 1.7791181634787407, + "language_loss": 0.88651741, + "learning_rate": 1.0232503019583088e-06, + "loss": 0.9139387, + "num_input_tokens_seen": 120351670, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.33007812, + "step": 5593, + "time_per_iteration": 2.983334541320801 + }, + { + "auxiliary_loss_clip": 0.01480831, + "auxiliary_loss_mlp": 0.01268574, + "balance_loss_clip": 1.12568784, + "balance_loss_mlp": 1.03358889, + "epoch": 0.6726387302350748, + "flos": 23729610830400.0, + "grad_norm": 1.799165713351836, + "language_loss": 0.69561923, + "learning_rate": 1.0225706189577619e-06, + "loss": 0.7231133, + "num_input_tokens_seen": 120370195, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.34570312, + "step": 5594, + "time_per_iteration": 3.0846476554870605 + }, + { + "auxiliary_loss_clip": 0.01483665, + "auxiliary_loss_mlp": 0.01265466, + "balance_loss_clip": 1.12810016, + "balance_loss_mlp": 1.02952731, + "epoch": 0.672758973125714, + "flos": 15189696930240.0, + "grad_norm": 2.20019027297721, + "language_loss": 0.74854672, + "learning_rate": 1.021891084232475e-06, + "loss": 0.77603799, + "num_input_tokens_seen": 120388130, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.35351562, + "step": 5595, + "time_per_iteration": 2.968324661254883 + }, + { + "auxiliary_loss_clip": 0.01486759, + "auxiliary_loss_mlp": 0.01266969, + "balance_loss_clip": 1.13178217, + "balance_loss_mlp": 1.03064883, + "epoch": 0.672879216016353, + "flos": 18079206724800.0, + "grad_norm": 2.337189538337094, + "language_loss": 0.80315495, + "learning_rate": 1.0212116978855325e-06, + "loss": 0.83069223, + "num_input_tokens_seen": 120406145, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 2.35742188, + "step": 5596, + "time_per_iteration": 3.8922438621520996 + }, + { + "auxiliary_loss_clip": 0.01481673, + "auxiliary_loss_mlp": 0.01261879, + "balance_loss_clip": 1.12620163, + "balance_loss_mlp": 1.02613032, + "epoch": 0.6729994589069921, + "flos": 23478561789120.0, + "grad_norm": 1.616669224506219, + "language_loss": 0.78768361, + "learning_rate": 1.020532460019997e-06, + "loss": 0.81511915, + "num_input_tokens_seen": 120425395, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.3515625, + "step": 5597, + "time_per_iteration": 3.059830665588379 + }, + { + "auxiliary_loss_clip": 0.01484471, + "auxiliary_loss_mlp": 0.01271522, + "balance_loss_clip": 1.12849808, + "balance_loss_mlp": 1.0334847, + "epoch": 0.6731197017976313, + "flos": 26324377979040.0, + "grad_norm": 1.8416270199273692, + "language_loss": 0.71159339, + "learning_rate": 1.0198533707389096e-06, + "loss": 0.73915327, + "num_input_tokens_seen": 120446270, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.375, + "step": 5598, + "time_per_iteration": 3.052536964416504 + }, + { + "auxiliary_loss_clip": 0.0148727, + "auxiliary_loss_mlp": 0.01255928, + "balance_loss_clip": 1.13195515, + "balance_loss_mlp": 1.01808214, + "epoch": 0.6732399446882703, + "flos": 21618584170560.0, + "grad_norm": 1.8475376256256792, + "language_loss": 0.72945547, + "learning_rate": 1.0191744301452853e-06, + "loss": 0.75688744, + "num_input_tokens_seen": 120465570, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.37304688, + "step": 5599, + "time_per_iteration": 3.089535713195801 + }, + { + "auxiliary_loss_clip": 0.01479541, + "auxiliary_loss_mlp": 0.01258105, + "balance_loss_clip": 1.12505817, + "balance_loss_mlp": 1.02159381, + "epoch": 0.6733601875789094, + "flos": 25882282974240.0, + "grad_norm": 3.264549960905228, + "language_loss": 0.70255828, + "learning_rate": 1.0184956383421208e-06, + "loss": 0.72993469, + "num_input_tokens_seen": 120484220, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 2.359375, + "step": 5600, + "time_per_iteration": 3.0230319499969482 + }, + { + "auxiliary_loss_clip": 0.01487244, + "auxiliary_loss_mlp": 0.01269003, + "balance_loss_clip": 1.1322248, + "balance_loss_mlp": 1.03153801, + "epoch": 0.6734804304695485, + "flos": 22931783969760.0, + "grad_norm": 3.461175268960319, + "language_loss": 0.655204, + "learning_rate": 1.017816995432387e-06, + "loss": 0.68276644, + "num_input_tokens_seen": 120503320, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.36914062, + "step": 5601, + "time_per_iteration": 3.0092787742614746 + }, + { + "auxiliary_loss_clip": 0.01484553, + "auxiliary_loss_mlp": 0.01260008, + "balance_loss_clip": 1.12790751, + "balance_loss_mlp": 1.02464104, + "epoch": 0.6736006733601876, + "flos": 18699983112960.0, + "grad_norm": 1.9545981341118912, + "language_loss": 0.74895334, + "learning_rate": 1.0171385015190353e-06, + "loss": 0.77639896, + "num_input_tokens_seen": 120523180, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.34765625, + "step": 5602, + "time_per_iteration": 3.0041754245758057 + }, + { + "auxiliary_loss_clip": 0.01483034, + "auxiliary_loss_mlp": 0.01269264, + "balance_loss_clip": 1.12703407, + "balance_loss_mlp": 1.03275251, + "epoch": 0.6737209162508266, + "flos": 19429804054080.0, + "grad_norm": 2.0800237974131988, + "language_loss": 0.73325574, + "learning_rate": 1.0164601567049908e-06, + "loss": 0.76077867, + "num_input_tokens_seen": 120541710, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.359375, + "step": 5603, + "time_per_iteration": 2.892085075378418 + }, + { + "auxiliary_loss_clip": 0.01487125, + "auxiliary_loss_mlp": 0.01273076, + "balance_loss_clip": 1.13111174, + "balance_loss_mlp": 1.03446722, + "epoch": 0.6738411591414658, + "flos": 20160193917600.0, + "grad_norm": 1.8472227292921097, + "language_loss": 0.80226314, + "learning_rate": 1.015781961093158e-06, + "loss": 0.8298651, + "num_input_tokens_seen": 120561030, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.38085938, + "step": 5604, + "time_per_iteration": 2.9990265369415283 + }, + { + "auxiliary_loss_clip": 0.01487477, + "auxiliary_loss_mlp": 0.01255267, + "balance_loss_clip": 1.13263559, + "balance_loss_mlp": 1.02199864, + "epoch": 0.6739614020321049, + "flos": 21656057556960.0, + "grad_norm": 1.5859881915873535, + "language_loss": 0.77293414, + "learning_rate": 1.0151039147864197e-06, + "loss": 0.80036163, + "num_input_tokens_seen": 120581005, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 2.32617188, + "step": 5605, + "time_per_iteration": 2.9852306842803955 + }, + { + "auxiliary_loss_clip": 0.01491931, + "auxiliary_loss_mlp": 0.01268517, + "balance_loss_clip": 1.13723636, + "balance_loss_mlp": 1.03124273, + "epoch": 0.6740816449227439, + "flos": 19173710567520.0, + "grad_norm": 2.545902146294237, + "language_loss": 0.66620362, + "learning_rate": 1.0144260178876336e-06, + "loss": 0.69380808, + "num_input_tokens_seen": 120600350, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.36914062, + "step": 5606, + "time_per_iteration": 2.9672975540161133 + }, + { + "auxiliary_loss_clip": 0.01493489, + "auxiliary_loss_mlp": 0.01257768, + "balance_loss_clip": 1.13788009, + "balance_loss_mlp": 1.02163851, + "epoch": 0.6742018878133831, + "flos": 21098659852800.0, + "grad_norm": 3.084428548480027, + "language_loss": 0.67253309, + "learning_rate": 1.0137482704996388e-06, + "loss": 0.7000457, + "num_input_tokens_seen": 120614700, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.35546875, + "step": 5607, + "time_per_iteration": 3.0760185718536377 + }, + { + "auxiliary_loss_clip": 0.01498566, + "auxiliary_loss_mlp": 0.01271353, + "balance_loss_clip": 1.14408338, + "balance_loss_mlp": 1.03388786, + "epoch": 0.6743221307040221, + "flos": 23552219004480.0, + "grad_norm": 2.0778578262546454, + "language_loss": 0.78998291, + "learning_rate": 1.0130706727252461e-06, + "loss": 0.81768215, + "num_input_tokens_seen": 120631755, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 2.36914062, + "step": 5608, + "time_per_iteration": 2.934903621673584 + }, + { + "auxiliary_loss_clip": 0.01491605, + "auxiliary_loss_mlp": 0.01261441, + "balance_loss_clip": 1.1357553, + "balance_loss_mlp": 1.02702761, + "epoch": 0.6744423735946612, + "flos": 16251202909440.0, + "grad_norm": 2.336719432320727, + "language_loss": 0.68693101, + "learning_rate": 1.0123932246672468e-06, + "loss": 0.71446151, + "num_input_tokens_seen": 120645900, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.33789062, + "step": 5609, + "time_per_iteration": 3.1625723838806152 + }, + { + "auxiliary_loss_clip": 0.01440279, + "auxiliary_loss_mlp": 0.01198456, + "balance_loss_clip": 1.10018456, + "balance_loss_mlp": 1.00466919, + "epoch": 0.6745626164853004, + "flos": 57849289421760.0, + "grad_norm": 0.747912651797533, + "language_loss": 0.55762631, + "learning_rate": 1.0117159264284114e-06, + "loss": 0.5840137, + "num_input_tokens_seen": 120709070, + "router_z_loss_clip": 3.40625, + "router_z_loss_mlp": 1.9375, + "step": 5610, + "time_per_iteration": 4.305769681930542 + }, + { + "auxiliary_loss_clip": 0.01497089, + "auxiliary_loss_mlp": 0.01265409, + "balance_loss_clip": 1.1424489, + "balance_loss_mlp": 1.02813494, + "epoch": 0.6746828593759394, + "flos": 20487100007520.0, + "grad_norm": 1.6556673526171994, + "language_loss": 0.77153873, + "learning_rate": 1.0110387781114837e-06, + "loss": 0.79916376, + "num_input_tokens_seen": 120727685, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 2.3671875, + "step": 5611, + "time_per_iteration": 3.0344724655151367 + }, + { + "auxiliary_loss_clip": 0.01494097, + "auxiliary_loss_mlp": 0.01268169, + "balance_loss_clip": 1.13904393, + "balance_loss_mlp": 1.03375554, + "epoch": 0.6748031022665785, + "flos": 19210235749920.0, + "grad_norm": 2.1710597593673353, + "language_loss": 0.77488488, + "learning_rate": 1.0103617798191872e-06, + "loss": 0.80250752, + "num_input_tokens_seen": 120747160, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.33984375, + "step": 5612, + "time_per_iteration": 2.9892563819885254 + }, + { + "auxiliary_loss_clip": 0.0149526, + "auxiliary_loss_mlp": 0.01266998, + "balance_loss_clip": 1.1407373, + "balance_loss_mlp": 1.02896047, + "epoch": 0.6749233451572175, + "flos": 15196827424320.0, + "grad_norm": 4.910746134887406, + "language_loss": 0.8248409, + "learning_rate": 1.0096849316542217e-06, + "loss": 0.85246348, + "num_input_tokens_seen": 120763710, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 2.375, + "step": 5613, + "time_per_iteration": 3.782498836517334 + }, + { + "auxiliary_loss_clip": 0.01490563, + "auxiliary_loss_mlp": 0.01263872, + "balance_loss_clip": 1.13630664, + "balance_loss_mlp": 1.02736056, + "epoch": 0.6750435880478567, + "flos": 26501580164160.0, + "grad_norm": 3.477344345273376, + "language_loss": 0.74731433, + "learning_rate": 1.0090082337192643e-06, + "loss": 0.77485865, + "num_input_tokens_seen": 120783355, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.359375, + "step": 5614, + "time_per_iteration": 2.9995248317718506 + }, + { + "auxiliary_loss_clip": 0.01487697, + "auxiliary_loss_mlp": 0.01267665, + "balance_loss_clip": 1.13332415, + "balance_loss_mlp": 1.03363347, + "epoch": 0.6751638309384957, + "flos": 23406573412800.0, + "grad_norm": 2.596909760206508, + "language_loss": 0.78433102, + "learning_rate": 1.0083316861169705e-06, + "loss": 0.81188464, + "num_input_tokens_seen": 120802090, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 2.33398438, + "step": 5615, + "time_per_iteration": 3.0250930786132812 + }, + { + "auxiliary_loss_clip": 0.01495951, + "auxiliary_loss_mlp": 0.0126763, + "balance_loss_clip": 1.14135861, + "balance_loss_mlp": 1.02863932, + "epoch": 0.6752840738291348, + "flos": 23443705445760.0, + "grad_norm": 7.92852414507484, + "language_loss": 0.71704668, + "learning_rate": 1.0076552889499713e-06, + "loss": 0.74468255, + "num_input_tokens_seen": 120822855, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 2.38476562, + "step": 5616, + "time_per_iteration": 3.9001357555389404 + }, + { + "auxiliary_loss_clip": 0.01495427, + "auxiliary_loss_mlp": 0.01271275, + "balance_loss_clip": 1.13982511, + "balance_loss_mlp": 1.0353359, + "epoch": 0.675404316719774, + "flos": 30338431083360.0, + "grad_norm": 2.0809736997065174, + "language_loss": 0.73333925, + "learning_rate": 1.006979042320876e-06, + "loss": 0.76100624, + "num_input_tokens_seen": 120843070, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.35351562, + "step": 5617, + "time_per_iteration": 3.0110719203948975 + }, + { + "auxiliary_loss_clip": 0.01488053, + "auxiliary_loss_mlp": 0.01257999, + "balance_loss_clip": 1.13184714, + "balance_loss_mlp": 1.02186966, + "epoch": 0.675524559610413, + "flos": 23624852159520.0, + "grad_norm": 2.115115340443863, + "language_loss": 0.63031906, + "learning_rate": 1.0063029463322702e-06, + "loss": 0.65777957, + "num_input_tokens_seen": 120863345, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.35546875, + "step": 5618, + "time_per_iteration": 3.044809579849243 + }, + { + "auxiliary_loss_clip": 0.01489432, + "auxiliary_loss_mlp": 0.01259108, + "balance_loss_clip": 1.13402724, + "balance_loss_mlp": 1.02355003, + "epoch": 0.6756448025010521, + "flos": 21250601519040.0, + "grad_norm": 4.592503343310389, + "language_loss": 0.75810647, + "learning_rate": 1.0056270010867164e-06, + "loss": 0.78559184, + "num_input_tokens_seen": 120880915, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.34960938, + "step": 5619, + "time_per_iteration": 2.963693380355835 + }, + { + "auxiliary_loss_clip": 0.01495999, + "auxiliary_loss_mlp": 0.01274329, + "balance_loss_clip": 1.14218068, + "balance_loss_mlp": 1.03495681, + "epoch": 0.6757650453916912, + "flos": 21648320212320.0, + "grad_norm": 3.2171011604858415, + "language_loss": 0.78529382, + "learning_rate": 1.004951206686758e-06, + "loss": 0.8129971, + "num_input_tokens_seen": 120899190, + "router_z_loss_clip": 3.54101562, + "router_z_loss_mlp": 2.38867188, + "step": 5620, + "time_per_iteration": 2.9500467777252197 + }, + { + "auxiliary_loss_clip": 0.01491087, + "auxiliary_loss_mlp": 0.01272449, + "balance_loss_clip": 1.13450241, + "balance_loss_mlp": 1.03383946, + "epoch": 0.6758852882823303, + "flos": 21797644835520.0, + "grad_norm": 2.407288190756135, + "language_loss": 0.71741676, + "learning_rate": 1.0042755632349087e-06, + "loss": 0.7450521, + "num_input_tokens_seen": 120916080, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.38085938, + "step": 5621, + "time_per_iteration": 2.964144468307495 + }, + { + "auxiliary_loss_clip": 0.01491147, + "auxiliary_loss_mlp": 0.01270829, + "balance_loss_clip": 1.13536501, + "balance_loss_mlp": 1.03336453, + "epoch": 0.6760055311729694, + "flos": 27091217168640.0, + "grad_norm": 2.1240928224119013, + "language_loss": 0.62491381, + "learning_rate": 1.0036000708336653e-06, + "loss": 0.65253353, + "num_input_tokens_seen": 120935210, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.36914062, + "step": 5622, + "time_per_iteration": 3.031672954559326 + }, + { + "auxiliary_loss_clip": 0.01496457, + "auxiliary_loss_mlp": 0.01269679, + "balance_loss_clip": 1.1407212, + "balance_loss_mlp": 1.03278625, + "epoch": 0.6761257740636085, + "flos": 18001491196320.0, + "grad_norm": 2.2967027934468742, + "language_loss": 0.79647768, + "learning_rate": 1.0029247295854984e-06, + "loss": 0.82413906, + "num_input_tokens_seen": 120951830, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.36328125, + "step": 5623, + "time_per_iteration": 3.702695608139038 + }, + { + "auxiliary_loss_clip": 0.01498904, + "auxiliary_loss_mlp": 0.01265395, + "balance_loss_clip": 1.14419162, + "balance_loss_mlp": 1.02869296, + "epoch": 0.6762460169542476, + "flos": 15123701203200.0, + "grad_norm": 2.2197268225581093, + "language_loss": 0.72334206, + "learning_rate": 1.0022495395928588e-06, + "loss": 0.75098503, + "num_input_tokens_seen": 120970310, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.36132812, + "step": 5624, + "time_per_iteration": 2.938349962234497 + }, + { + "auxiliary_loss_clip": 0.01448926, + "auxiliary_loss_mlp": 0.01197845, + "balance_loss_clip": 1.10724235, + "balance_loss_mlp": 1.00291443, + "epoch": 0.6763662598448866, + "flos": 67894131791520.0, + "grad_norm": 0.8000773608940593, + "language_loss": 0.62269068, + "learning_rate": 1.0015745009581697e-06, + "loss": 0.64915836, + "num_input_tokens_seen": 121031915, + "router_z_loss_clip": 3.421875, + "router_z_loss_mlp": 1.94921875, + "step": 5625, + "time_per_iteration": 3.4970908164978027 + }, + { + "auxiliary_loss_clip": 0.01497741, + "auxiliary_loss_mlp": 0.01276652, + "balance_loss_clip": 1.14267397, + "balance_loss_mlp": 1.03823352, + "epoch": 0.6764865027355258, + "flos": 20633959300320.0, + "grad_norm": 2.4701220843085285, + "language_loss": 0.66613877, + "learning_rate": 1.0008996137838343e-06, + "loss": 0.6938827, + "num_input_tokens_seen": 121050890, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.37890625, + "step": 5626, + "time_per_iteration": 2.9558534622192383 + }, + { + "auxiliary_loss_clip": 0.01493135, + "auxiliary_loss_mlp": 0.01264629, + "balance_loss_clip": 1.13724375, + "balance_loss_mlp": 1.02602005, + "epoch": 0.6766067456261649, + "flos": 21217982937120.0, + "grad_norm": 1.9412408044985443, + "language_loss": 0.80161643, + "learning_rate": 1.000224878172234e-06, + "loss": 0.82919407, + "num_input_tokens_seen": 121070015, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.38085938, + "step": 5627, + "time_per_iteration": 2.9261574745178223 + }, + { + "auxiliary_loss_clip": 0.01489032, + "auxiliary_loss_mlp": 0.01269474, + "balance_loss_clip": 1.13319993, + "balance_loss_mlp": 1.03200877, + "epoch": 0.6767269885168039, + "flos": 19940322188160.0, + "grad_norm": 2.1586196548802103, + "language_loss": 0.72912014, + "learning_rate": 9.99550294225724e-07, + "loss": 0.75670516, + "num_input_tokens_seen": 121089170, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.36914062, + "step": 5628, + "time_per_iteration": 2.9934465885162354 + }, + { + "auxiliary_loss_clip": 0.01488859, + "auxiliary_loss_mlp": 0.01276595, + "balance_loss_clip": 1.13317561, + "balance_loss_mlp": 1.03779471, + "epoch": 0.6768472314074431, + "flos": 20816205930720.0, + "grad_norm": 2.2955361668227896, + "language_loss": 0.72487152, + "learning_rate": 9.988758620466402e-07, + "loss": 0.75252604, + "num_input_tokens_seen": 121108040, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.3828125, + "step": 5629, + "time_per_iteration": 2.970203399658203 + }, + { + "auxiliary_loss_clip": 0.01493635, + "auxiliary_loss_mlp": 0.01260935, + "balance_loss_clip": 1.13811707, + "balance_loss_mlp": 1.02537727, + "epoch": 0.6769674742980821, + "flos": 23188294666080.0, + "grad_norm": 1.667263785639948, + "language_loss": 0.76199603, + "learning_rate": 9.982015817372917e-07, + "loss": 0.78954178, + "num_input_tokens_seen": 121128480, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.34960938, + "step": 5630, + "time_per_iteration": 3.10129714012146 + }, + { + "auxiliary_loss_clip": 0.01494788, + "auxiliary_loss_mlp": 0.0126982, + "balance_loss_clip": 1.13916993, + "balance_loss_mlp": 1.03311801, + "epoch": 0.6770877171887212, + "flos": 24245211337920.0, + "grad_norm": 1.881468433223063, + "language_loss": 0.81985235, + "learning_rate": 9.975274533999657e-07, + "loss": 0.84749848, + "num_input_tokens_seen": 121148010, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.36132812, + "step": 5631, + "time_per_iteration": 3.0283055305480957 + }, + { + "auxiliary_loss_clip": 0.01497584, + "auxiliary_loss_mlp": 0.01277035, + "balance_loss_clip": 1.14327621, + "balance_loss_mlp": 1.04014182, + "epoch": 0.6772079600793603, + "flos": 18143040546720.0, + "grad_norm": 2.4526933376628524, + "language_loss": 0.84436417, + "learning_rate": 9.96853477136929e-07, + "loss": 0.87211037, + "num_input_tokens_seen": 121162755, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.36328125, + "step": 5632, + "time_per_iteration": 2.990159034729004 + }, + { + "auxiliary_loss_clip": 0.01491135, + "auxiliary_loss_mlp": 0.01269379, + "balance_loss_clip": 1.13537621, + "balance_loss_mlp": 1.0332489, + "epoch": 0.6773282029699994, + "flos": 22454074058400.0, + "grad_norm": 2.693193265210511, + "language_loss": 0.75472176, + "learning_rate": 9.96179653050422e-07, + "loss": 0.78232694, + "num_input_tokens_seen": 121182915, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.35546875, + "step": 5633, + "time_per_iteration": 2.9535417556762695 + }, + { + "auxiliary_loss_clip": 0.01494822, + "auxiliary_loss_mlp": 0.01266199, + "balance_loss_clip": 1.13922465, + "balance_loss_mlp": 1.02854347, + "epoch": 0.6774484458606385, + "flos": 18695507590080.0, + "grad_norm": 3.9016192550193916, + "language_loss": 0.74219131, + "learning_rate": 9.955059812426635e-07, + "loss": 0.76980156, + "num_input_tokens_seen": 121200445, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.37304688, + "step": 5634, + "time_per_iteration": 3.0062620639801025 + }, + { + "auxiliary_loss_clip": 0.01505772, + "auxiliary_loss_mlp": 0.01276511, + "balance_loss_clip": 1.15146089, + "balance_loss_mlp": 1.03847396, + "epoch": 0.6775686887512776, + "flos": 25996409900640.0, + "grad_norm": 3.070505476048047, + "language_loss": 0.83014894, + "learning_rate": 9.948324618158493e-07, + "loss": 0.85797167, + "num_input_tokens_seen": 121220785, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 2.375, + "step": 5635, + "time_per_iteration": 3.063831090927124 + }, + { + "auxiliary_loss_clip": 0.01496047, + "auxiliary_loss_mlp": 0.01275733, + "balance_loss_clip": 1.139624, + "balance_loss_mlp": 1.03597975, + "epoch": 0.6776889316419167, + "flos": 13589605614240.0, + "grad_norm": 2.7657706790491687, + "language_loss": 0.77969867, + "learning_rate": 9.941590948721502e-07, + "loss": 0.80741644, + "num_input_tokens_seen": 121237985, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.39257812, + "step": 5636, + "time_per_iteration": 2.9888315200805664 + }, + { + "auxiliary_loss_clip": 0.01491146, + "auxiliary_loss_mlp": 0.01265063, + "balance_loss_clip": 1.13599253, + "balance_loss_mlp": 1.02893305, + "epoch": 0.6778091745325557, + "flos": 27603669638880.0, + "grad_norm": 1.8990599627393903, + "language_loss": 0.76288807, + "learning_rate": 9.934858805137188e-07, + "loss": 0.79045022, + "num_input_tokens_seen": 121258635, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.35742188, + "step": 5637, + "time_per_iteration": 3.039547920227051 + }, + { + "auxiliary_loss_clip": 0.01493174, + "auxiliary_loss_mlp": 0.01262636, + "balance_loss_clip": 1.13748169, + "balance_loss_mlp": 1.0266968, + "epoch": 0.6779294174231949, + "flos": 18736053157440.0, + "grad_norm": 4.904210024099947, + "language_loss": 0.80912369, + "learning_rate": 9.92812818842677e-07, + "loss": 0.83668178, + "num_input_tokens_seen": 121277810, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.35546875, + "step": 5638, + "time_per_iteration": 3.8419156074523926 + }, + { + "auxiliary_loss_clip": 0.01483948, + "auxiliary_loss_mlp": 0.01265293, + "balance_loss_clip": 1.12647367, + "balance_loss_mlp": 1.02820933, + "epoch": 0.678049660313834, + "flos": 45876994872480.0, + "grad_norm": 2.2146640996070617, + "language_loss": 0.63964725, + "learning_rate": 9.921399099611306e-07, + "loss": 0.66713965, + "num_input_tokens_seen": 121298975, + "router_z_loss_clip": 3.58007812, + "router_z_loss_mlp": 2.36523438, + "step": 5639, + "time_per_iteration": 3.1107606887817383 + }, + { + "auxiliary_loss_clip": 0.01488083, + "auxiliary_loss_mlp": 0.01267702, + "balance_loss_clip": 1.13315892, + "balance_loss_mlp": 1.03099966, + "epoch": 0.678169903204473, + "flos": 19976619801600.0, + "grad_norm": 1.678575049937491, + "language_loss": 0.69077826, + "learning_rate": 9.914671539711588e-07, + "loss": 0.71833611, + "num_input_tokens_seen": 121318495, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 2.36132812, + "step": 5640, + "time_per_iteration": 2.943145513534546 + }, + { + "auxiliary_loss_clip": 0.01493026, + "auxiliary_loss_mlp": 0.01267458, + "balance_loss_clip": 1.13656175, + "balance_loss_mlp": 1.03056562, + "epoch": 0.6782901460951122, + "flos": 21397650452640.0, + "grad_norm": 3.2271169122540826, + "language_loss": 0.78672791, + "learning_rate": 9.90794550974817e-07, + "loss": 0.81433272, + "num_input_tokens_seen": 121338890, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.36328125, + "step": 5641, + "time_per_iteration": 3.7435905933380127 + }, + { + "auxiliary_loss_clip": 0.01488719, + "auxiliary_loss_mlp": 0.01279101, + "balance_loss_clip": 1.13186669, + "balance_loss_mlp": 1.03915632, + "epoch": 0.6784103889857512, + "flos": 21436147899360.0, + "grad_norm": 2.5284617265703013, + "language_loss": 0.80939716, + "learning_rate": 9.901221010741407e-07, + "loss": 0.83707535, + "num_input_tokens_seen": 121358210, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.39453125, + "step": 5642, + "time_per_iteration": 2.949130058288574 + }, + { + "auxiliary_loss_clip": 0.01495919, + "auxiliary_loss_mlp": 0.0127845, + "balance_loss_clip": 1.13897443, + "balance_loss_mlp": 1.04155695, + "epoch": 0.6785306318763903, + "flos": 32674715271360.0, + "grad_norm": 3.8935557504187503, + "language_loss": 0.74975777, + "learning_rate": 9.894498043711375e-07, + "loss": 0.77750146, + "num_input_tokens_seen": 121379955, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.36328125, + "step": 5643, + "time_per_iteration": 3.8096578121185303 + }, + { + "auxiliary_loss_clip": 0.01486371, + "auxiliary_loss_mlp": 0.01262286, + "balance_loss_clip": 1.12999892, + "balance_loss_mlp": 1.02806354, + "epoch": 0.6786508747670293, + "flos": 25634647467360.0, + "grad_norm": 2.0284653695235213, + "language_loss": 0.69437528, + "learning_rate": 9.887776609677962e-07, + "loss": 0.72186178, + "num_input_tokens_seen": 121401325, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.3359375, + "step": 5644, + "time_per_iteration": 2.9989423751831055 + }, + { + "auxiliary_loss_clip": 0.01485478, + "auxiliary_loss_mlp": 0.0126623, + "balance_loss_clip": 1.12816978, + "balance_loss_mlp": 1.03143573, + "epoch": 0.6787711176576685, + "flos": 19173900208320.0, + "grad_norm": 1.8919676932763676, + "language_loss": 0.72546029, + "learning_rate": 9.88105670966079e-07, + "loss": 0.75297737, + "num_input_tokens_seen": 121419785, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.34179688, + "step": 5645, + "time_per_iteration": 3.0387611389160156 + }, + { + "auxiliary_loss_clip": 0.01490704, + "auxiliary_loss_mlp": 0.0126451, + "balance_loss_clip": 1.13402414, + "balance_loss_mlp": 1.03009653, + "epoch": 0.6788913605483076, + "flos": 13986565744320.0, + "grad_norm": 2.181940411475456, + "language_loss": 0.78997993, + "learning_rate": 9.874338344679283e-07, + "loss": 0.81753218, + "num_input_tokens_seen": 121435630, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.33789062, + "step": 5646, + "time_per_iteration": 2.9644742012023926 + }, + { + "auxiliary_loss_clip": 0.01490324, + "auxiliary_loss_mlp": 0.01262343, + "balance_loss_clip": 1.13270688, + "balance_loss_mlp": 1.02773857, + "epoch": 0.6790116034389466, + "flos": 22019754326400.0, + "grad_norm": 1.745389705644491, + "language_loss": 0.74472386, + "learning_rate": 9.86762151575259e-07, + "loss": 0.77225053, + "num_input_tokens_seen": 121455625, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 2.33984375, + "step": 5647, + "time_per_iteration": 2.9507768154144287 + }, + { + "auxiliary_loss_clip": 0.01488148, + "auxiliary_loss_mlp": 0.01254684, + "balance_loss_clip": 1.12982988, + "balance_loss_mlp": 1.0202713, + "epoch": 0.6791318463295858, + "flos": 20924529848640.0, + "grad_norm": 1.474926025989104, + "language_loss": 0.80465615, + "learning_rate": 9.860906223899651e-07, + "loss": 0.83208454, + "num_input_tokens_seen": 121475020, + "router_z_loss_clip": 3.58398438, + "router_z_loss_mlp": 2.33789062, + "step": 5648, + "time_per_iteration": 3.121286392211914 + }, + { + "auxiliary_loss_clip": 0.01488126, + "auxiliary_loss_mlp": 0.01264678, + "balance_loss_clip": 1.13127506, + "balance_loss_mlp": 1.02683187, + "epoch": 0.6792520892202248, + "flos": 28515320000640.0, + "grad_norm": 4.691407096239184, + "language_loss": 0.75941128, + "learning_rate": 9.854192470139184e-07, + "loss": 0.78693926, + "num_input_tokens_seen": 121496500, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.37304688, + "step": 5649, + "time_per_iteration": 3.1056339740753174 + }, + { + "auxiliary_loss_clip": 0.01492725, + "auxiliary_loss_mlp": 0.01259741, + "balance_loss_clip": 1.13764238, + "balance_loss_mlp": 1.02151346, + "epoch": 0.6793723321108639, + "flos": 20014017331680.0, + "grad_norm": 8.678847661143877, + "language_loss": 0.7167002, + "learning_rate": 9.847480255489645e-07, + "loss": 0.74422485, + "num_input_tokens_seen": 121515525, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.37695312, + "step": 5650, + "time_per_iteration": 3.896855354309082 + }, + { + "auxiliary_loss_clip": 0.01493697, + "auxiliary_loss_mlp": 0.01261249, + "balance_loss_clip": 1.13675809, + "balance_loss_mlp": 1.02588236, + "epoch": 0.6794925750015031, + "flos": 26651815063200.0, + "grad_norm": 1.6853033254704908, + "language_loss": 0.69263268, + "learning_rate": 9.840769580969295e-07, + "loss": 0.72018218, + "num_input_tokens_seen": 121535965, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.34765625, + "step": 5651, + "time_per_iteration": 3.317218542098999 + }, + { + "auxiliary_loss_clip": 0.01490229, + "auxiliary_loss_mlp": 0.01262234, + "balance_loss_clip": 1.13317728, + "balance_loss_mlp": 1.02667665, + "epoch": 0.6796128178921421, + "flos": 21582552054240.0, + "grad_norm": 1.9735410665280526, + "language_loss": 0.80206573, + "learning_rate": 9.834060447596114e-07, + "loss": 0.82959032, + "num_input_tokens_seen": 121555235, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.34960938, + "step": 5652, + "time_per_iteration": 3.0482962131500244 + }, + { + "auxiliary_loss_clip": 0.01492102, + "auxiliary_loss_mlp": 0.01271334, + "balance_loss_clip": 1.13428652, + "balance_loss_mlp": 1.03634906, + "epoch": 0.6797330607827812, + "flos": 22494202416000.0, + "grad_norm": 2.2127441702131447, + "language_loss": 0.78053796, + "learning_rate": 9.827352856387868e-07, + "loss": 0.80817235, + "num_input_tokens_seen": 121574945, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 2.34375, + "step": 5653, + "time_per_iteration": 3.033381938934326 + }, + { + "auxiliary_loss_clip": 0.01445796, + "auxiliary_loss_mlp": 0.01203094, + "balance_loss_clip": 1.10556006, + "balance_loss_mlp": 1.01045227, + "epoch": 0.6798533036734203, + "flos": 66313080776160.0, + "grad_norm": 0.774664712036819, + "language_loss": 0.64208817, + "learning_rate": 9.820646808362118e-07, + "loss": 0.66857708, + "num_input_tokens_seen": 121641200, + "router_z_loss_clip": 3.40625, + "router_z_loss_mlp": 1.92578125, + "step": 5654, + "time_per_iteration": 3.5652987957000732 + }, + { + "auxiliary_loss_clip": 0.01490455, + "auxiliary_loss_mlp": 0.01256201, + "balance_loss_clip": 1.13372493, + "balance_loss_mlp": 1.02159691, + "epoch": 0.6799735465640594, + "flos": 16182021216960.0, + "grad_norm": 2.9509857103216457, + "language_loss": 0.72926849, + "learning_rate": 9.813942304536154e-07, + "loss": 0.75673503, + "num_input_tokens_seen": 121659170, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.34179688, + "step": 5655, + "time_per_iteration": 3.009330987930298 + }, + { + "auxiliary_loss_clip": 0.01493061, + "auxiliary_loss_mlp": 0.012651, + "balance_loss_clip": 1.13657188, + "balance_loss_mlp": 1.02897, + "epoch": 0.6800937894546984, + "flos": 22127850675360.0, + "grad_norm": 1.8552160859248368, + "language_loss": 0.63627398, + "learning_rate": 9.807239345927043e-07, + "loss": 0.66385567, + "num_input_tokens_seen": 121679180, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.35546875, + "step": 5656, + "time_per_iteration": 3.1278576850891113 + }, + { + "auxiliary_loss_clip": 0.01487304, + "auxiliary_loss_mlp": 0.01263818, + "balance_loss_clip": 1.13099527, + "balance_loss_mlp": 1.0273068, + "epoch": 0.6802140323453376, + "flos": 31615636694400.0, + "grad_norm": 2.3658009164547855, + "language_loss": 0.71639299, + "learning_rate": 9.80053793355162e-07, + "loss": 0.74390417, + "num_input_tokens_seen": 121697875, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.359375, + "step": 5657, + "time_per_iteration": 3.119426727294922 + }, + { + "auxiliary_loss_clip": 0.01491711, + "auxiliary_loss_mlp": 0.0126378, + "balance_loss_clip": 1.13612986, + "balance_loss_mlp": 1.02517056, + "epoch": 0.6803342752359767, + "flos": 17714903104800.0, + "grad_norm": 2.40716875850872, + "language_loss": 0.74544358, + "learning_rate": 9.793838068426472e-07, + "loss": 0.77299851, + "num_input_tokens_seen": 121715570, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.38085938, + "step": 5658, + "time_per_iteration": 2.9410746097564697 + }, + { + "auxiliary_loss_clip": 0.01493459, + "auxiliary_loss_mlp": 0.01260297, + "balance_loss_clip": 1.13642609, + "balance_loss_mlp": 1.02283168, + "epoch": 0.6804545181266157, + "flos": 11328344055360.0, + "grad_norm": 4.170942216406692, + "language_loss": 0.61260331, + "learning_rate": 9.78713975156799e-07, + "loss": 0.64014089, + "num_input_tokens_seen": 121731435, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 2.37109375, + "step": 5659, + "time_per_iteration": 2.97836971282959 + }, + { + "auxiliary_loss_clip": 0.01494659, + "auxiliary_loss_mlp": 0.01270036, + "balance_loss_clip": 1.13769686, + "balance_loss_mlp": 1.03142619, + "epoch": 0.6805747610172549, + "flos": 29353540716000.0, + "grad_norm": 2.085776807474136, + "language_loss": 0.7193715, + "learning_rate": 9.780442983992273e-07, + "loss": 0.7470184, + "num_input_tokens_seen": 121749950, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.38085938, + "step": 5660, + "time_per_iteration": 2.9603257179260254 + }, + { + "auxiliary_loss_clip": 0.0149326, + "auxiliary_loss_mlp": 0.01269031, + "balance_loss_clip": 1.13685274, + "balance_loss_mlp": 1.03232884, + "epoch": 0.680695003907894, + "flos": 37634819942880.0, + "grad_norm": 1.8475143335676607, + "language_loss": 0.71802956, + "learning_rate": 9.773747766715238e-07, + "loss": 0.7456525, + "num_input_tokens_seen": 121770770, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.36132812, + "step": 5661, + "time_per_iteration": 3.0842742919921875 + }, + { + "auxiliary_loss_clip": 0.0148627, + "auxiliary_loss_mlp": 0.01259006, + "balance_loss_clip": 1.12933588, + "balance_loss_mlp": 1.02096868, + "epoch": 0.680815246798533, + "flos": 22129557442560.0, + "grad_norm": 2.1902453351301987, + "language_loss": 0.80080092, + "learning_rate": 9.767054100752536e-07, + "loss": 0.82825369, + "num_input_tokens_seen": 121790720, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.375, + "step": 5662, + "time_per_iteration": 2.982814311981201 + }, + { + "auxiliary_loss_clip": 0.01495605, + "auxiliary_loss_mlp": 0.01266087, + "balance_loss_clip": 1.13938427, + "balance_loss_mlp": 1.02881312, + "epoch": 0.6809354896891722, + "flos": 17203740192000.0, + "grad_norm": 2.0734891615572835, + "language_loss": 0.82062685, + "learning_rate": 9.760361987119584e-07, + "loss": 0.84824371, + "num_input_tokens_seen": 121808455, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.3671875, + "step": 5663, + "time_per_iteration": 3.005859136581421 + }, + { + "auxiliary_loss_clip": 0.01486572, + "auxiliary_loss_mlp": 0.01254868, + "balance_loss_clip": 1.12987185, + "balance_loss_mlp": 1.01816595, + "epoch": 0.6810557325798112, + "flos": 12459790290240.0, + "grad_norm": 2.9366257187055327, + "language_loss": 0.67940134, + "learning_rate": 9.753671426831592e-07, + "loss": 0.70681572, + "num_input_tokens_seen": 121824470, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.36132812, + "step": 5664, + "time_per_iteration": 3.0948269367218018 + }, + { + "auxiliary_loss_clip": 0.01487797, + "auxiliary_loss_mlp": 0.01262683, + "balance_loss_clip": 1.13067651, + "balance_loss_mlp": 1.02445483, + "epoch": 0.6811759754704503, + "flos": 22157928070560.0, + "grad_norm": 2.0659867655433586, + "language_loss": 0.79890347, + "learning_rate": 9.746982420903483e-07, + "loss": 0.82640821, + "num_input_tokens_seen": 121842665, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.37695312, + "step": 5665, + "time_per_iteration": 3.023340940475464 + }, + { + "auxiliary_loss_clip": 0.01493437, + "auxiliary_loss_mlp": 0.01264186, + "balance_loss_clip": 1.13926196, + "balance_loss_mlp": 1.02824676, + "epoch": 0.6812962183610894, + "flos": 17527043106720.0, + "grad_norm": 2.1572264096913134, + "language_loss": 0.75122827, + "learning_rate": 9.740294970349993e-07, + "loss": 0.77880448, + "num_input_tokens_seen": 121859080, + "router_z_loss_clip": 3.54101562, + "router_z_loss_mlp": 2.35351562, + "step": 5666, + "time_per_iteration": 3.8442399501800537 + }, + { + "auxiliary_loss_clip": 0.01443844, + "auxiliary_loss_mlp": 0.01192963, + "balance_loss_clip": 1.1041311, + "balance_loss_mlp": 1.00108337, + "epoch": 0.6814164612517285, + "flos": 60279902036640.0, + "grad_norm": 0.922892509242016, + "language_loss": 0.60862958, + "learning_rate": 9.733609076185594e-07, + "loss": 0.63499767, + "num_input_tokens_seen": 121915485, + "router_z_loss_clip": 3.3984375, + "router_z_loss_mlp": 1.91796875, + "step": 5667, + "time_per_iteration": 3.4344005584716797 + }, + { + "auxiliary_loss_clip": 0.01492661, + "auxiliary_loss_mlp": 0.01264204, + "balance_loss_clip": 1.13837981, + "balance_loss_mlp": 1.0273118, + "epoch": 0.6815367041423676, + "flos": 19319773368960.0, + "grad_norm": 2.1713836137308404, + "language_loss": 0.8422305, + "learning_rate": 9.72692473942455e-07, + "loss": 0.86979914, + "num_input_tokens_seen": 121932710, + "router_z_loss_clip": 3.54101562, + "router_z_loss_mlp": 2.36328125, + "step": 5668, + "time_per_iteration": 4.0485146045684814 + }, + { + "auxiliary_loss_clip": 0.01486093, + "auxiliary_loss_mlp": 0.01268988, + "balance_loss_clip": 1.13023329, + "balance_loss_mlp": 1.03152311, + "epoch": 0.6816569470330067, + "flos": 22163769007200.0, + "grad_norm": 1.5174386916896663, + "language_loss": 0.77640128, + "learning_rate": 9.720241961080849e-07, + "loss": 0.8039521, + "num_input_tokens_seen": 121952025, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.36914062, + "step": 5669, + "time_per_iteration": 3.124555826187134 + }, + { + "auxiliary_loss_clip": 0.01482868, + "auxiliary_loss_mlp": 0.01265242, + "balance_loss_clip": 1.12776518, + "balance_loss_mlp": 1.03006589, + "epoch": 0.6817771899236458, + "flos": 41466702273120.0, + "grad_norm": 2.5090374381523195, + "language_loss": 0.73049021, + "learning_rate": 9.713560742168259e-07, + "loss": 0.75797135, + "num_input_tokens_seen": 121974650, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.34570312, + "step": 5670, + "time_per_iteration": 3.974261522293091 + }, + { + "auxiliary_loss_clip": 0.01491226, + "auxiliary_loss_mlp": 0.0127257, + "balance_loss_clip": 1.13648033, + "balance_loss_mlp": 1.03815722, + "epoch": 0.6818974328142848, + "flos": 21108293605440.0, + "grad_norm": 2.645932713383785, + "language_loss": 0.71330786, + "learning_rate": 9.706881083700333e-07, + "loss": 0.74094582, + "num_input_tokens_seen": 121994335, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.33789062, + "step": 5671, + "time_per_iteration": 3.1126089096069336 + }, + { + "auxiliary_loss_clip": 0.01490113, + "auxiliary_loss_mlp": 0.01265506, + "balance_loss_clip": 1.13607764, + "balance_loss_mlp": 1.0280416, + "epoch": 0.682017675704924, + "flos": 20443596043680.0, + "grad_norm": 2.7063188249980374, + "language_loss": 0.82827598, + "learning_rate": 9.700202986690357e-07, + "loss": 0.85583216, + "num_input_tokens_seen": 122012635, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 2.36914062, + "step": 5672, + "time_per_iteration": 3.013869285583496 + }, + { + "auxiliary_loss_clip": 0.01489189, + "auxiliary_loss_mlp": 0.01276453, + "balance_loss_clip": 1.1345706, + "balance_loss_mlp": 1.03860629, + "epoch": 0.682137918595563, + "flos": 20046332488320.0, + "grad_norm": 2.0851665079649737, + "language_loss": 0.66747332, + "learning_rate": 9.693526452151413e-07, + "loss": 0.69512969, + "num_input_tokens_seen": 122031685, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.37304688, + "step": 5673, + "time_per_iteration": 3.1075057983398438 + }, + { + "auxiliary_loss_clip": 0.01490963, + "auxiliary_loss_mlp": 0.01276937, + "balance_loss_clip": 1.13564169, + "balance_loss_mlp": 1.03775525, + "epoch": 0.6822581614862021, + "flos": 31687169932800.0, + "grad_norm": 1.936040541737174, + "language_loss": 0.75630462, + "learning_rate": 9.686851481096305e-07, + "loss": 0.78398359, + "num_input_tokens_seen": 122052995, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.38671875, + "step": 5674, + "time_per_iteration": 3.098996639251709 + }, + { + "auxiliary_loss_clip": 0.0148778, + "auxiliary_loss_mlp": 0.01268764, + "balance_loss_clip": 1.13237381, + "balance_loss_mlp": 1.03129923, + "epoch": 0.6823784043768413, + "flos": 23479851346560.0, + "grad_norm": 2.549192042463164, + "language_loss": 0.72110212, + "learning_rate": 9.68017807453762e-07, + "loss": 0.7486676, + "num_input_tokens_seen": 122071740, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.36914062, + "step": 5675, + "time_per_iteration": 3.1070594787597656 + }, + { + "auxiliary_loss_clip": 0.0148866, + "auxiliary_loss_mlp": 0.01264093, + "balance_loss_clip": 1.13419926, + "balance_loss_mlp": 1.03025174, + "epoch": 0.6824986472674803, + "flos": 14138886692160.0, + "grad_norm": 3.51801872571002, + "language_loss": 0.7364105, + "learning_rate": 9.673506233487721e-07, + "loss": 0.76393795, + "num_input_tokens_seen": 122089705, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 2.3359375, + "step": 5676, + "time_per_iteration": 3.0891082286834717 + }, + { + "auxiliary_loss_clip": 0.01484928, + "auxiliary_loss_mlp": 0.01264784, + "balance_loss_clip": 1.12987161, + "balance_loss_mlp": 1.0332315, + "epoch": 0.6826188901581194, + "flos": 21507036359040.0, + "grad_norm": 2.5210734627361617, + "language_loss": 0.86025244, + "learning_rate": 9.666835958958717e-07, + "loss": 0.88774955, + "num_input_tokens_seen": 122109025, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.3125, + "step": 5677, + "time_per_iteration": 3.0049448013305664 + }, + { + "auxiliary_loss_clip": 0.01495991, + "auxiliary_loss_mlp": 0.01264548, + "balance_loss_clip": 1.14173794, + "balance_loss_mlp": 1.02822721, + "epoch": 0.6827391330487584, + "flos": 20812033833120.0, + "grad_norm": 2.197068685843976, + "language_loss": 0.80719566, + "learning_rate": 9.660167251962484e-07, + "loss": 0.83480096, + "num_input_tokens_seen": 122127385, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.35742188, + "step": 5678, + "time_per_iteration": 3.9341492652893066 + }, + { + "auxiliary_loss_clip": 0.01491238, + "auxiliary_loss_mlp": 0.01274319, + "balance_loss_clip": 1.13544822, + "balance_loss_mlp": 1.04200423, + "epoch": 0.6828593759393976, + "flos": 21690913900320.0, + "grad_norm": 1.6350778754561857, + "language_loss": 0.78018993, + "learning_rate": 9.653500113510654e-07, + "loss": 0.80784547, + "num_input_tokens_seen": 122146500, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.3203125, + "step": 5679, + "time_per_iteration": 3.027899742126465 + }, + { + "auxiliary_loss_clip": 0.01493925, + "auxiliary_loss_mlp": 0.01264064, + "balance_loss_clip": 1.13955903, + "balance_loss_mlp": 1.02850699, + "epoch": 0.6829796188300367, + "flos": 25340056534080.0, + "grad_norm": 2.310167952664068, + "language_loss": 0.67301297, + "learning_rate": 9.646834544614627e-07, + "loss": 0.70059288, + "num_input_tokens_seen": 122167000, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 2.34960938, + "step": 5680, + "time_per_iteration": 3.033836841583252 + }, + { + "auxiliary_loss_clip": 0.0149723, + "auxiliary_loss_mlp": 0.01267029, + "balance_loss_clip": 1.14275777, + "balance_loss_mlp": 1.02918255, + "epoch": 0.6830998617206757, + "flos": 20706933808800.0, + "grad_norm": 2.0405313185384237, + "language_loss": 0.76136392, + "learning_rate": 9.64017054628558e-07, + "loss": 0.78900647, + "num_input_tokens_seen": 122185825, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.37304688, + "step": 5681, + "time_per_iteration": 2.9552087783813477 + }, + { + "auxiliary_loss_clip": 0.0148415, + "auxiliary_loss_mlp": 0.0125934, + "balance_loss_clip": 1.12842989, + "balance_loss_mlp": 1.02340055, + "epoch": 0.6832201046113149, + "flos": 21728842424640.0, + "grad_norm": 1.8262071509511528, + "language_loss": 0.79039776, + "learning_rate": 9.63350811953441e-07, + "loss": 0.81783271, + "num_input_tokens_seen": 122206200, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.35546875, + "step": 5682, + "time_per_iteration": 3.005333423614502 + }, + { + "auxiliary_loss_clip": 0.01492236, + "auxiliary_loss_mlp": 0.01259414, + "balance_loss_clip": 1.13697326, + "balance_loss_mlp": 1.02538228, + "epoch": 0.6833403475019539, + "flos": 19538583109920.0, + "grad_norm": 2.312506863282516, + "language_loss": 0.70629156, + "learning_rate": 9.626847265371826e-07, + "loss": 0.73380804, + "num_input_tokens_seen": 122225520, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.33398438, + "step": 5683, + "time_per_iteration": 3.007946491241455 + }, + { + "auxiliary_loss_clip": 0.01489113, + "auxiliary_loss_mlp": 0.01256515, + "balance_loss_clip": 1.13303554, + "balance_loss_mlp": 1.02114797, + "epoch": 0.683460590392593, + "flos": 19354060789920.0, + "grad_norm": 6.888943041566417, + "language_loss": 0.78982902, + "learning_rate": 9.620187984808262e-07, + "loss": 0.8172853, + "num_input_tokens_seen": 122244320, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.34960938, + "step": 5684, + "time_per_iteration": 2.9570488929748535 + }, + { + "auxiliary_loss_clip": 0.01488956, + "auxiliary_loss_mlp": 0.01268611, + "balance_loss_clip": 1.13242364, + "balance_loss_mlp": 1.0341984, + "epoch": 0.6835808332832322, + "flos": 23290398365760.0, + "grad_norm": 11.565922552014477, + "language_loss": 0.86112094, + "learning_rate": 9.613530278853919e-07, + "loss": 0.88869667, + "num_input_tokens_seen": 122264295, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.33984375, + "step": 5685, + "time_per_iteration": 3.0018088817596436 + }, + { + "auxiliary_loss_clip": 0.01488934, + "auxiliary_loss_mlp": 0.0127135, + "balance_loss_clip": 1.13265526, + "balance_loss_mlp": 1.03407633, + "epoch": 0.6837010761738712, + "flos": 21655602419040.0, + "grad_norm": 2.2155196927480576, + "language_loss": 0.74333113, + "learning_rate": 9.60687414851879e-07, + "loss": 0.77093399, + "num_input_tokens_seen": 122285300, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.3671875, + "step": 5686, + "time_per_iteration": 3.0457112789154053 + }, + { + "auxiliary_loss_clip": 0.01491056, + "auxiliary_loss_mlp": 0.01269834, + "balance_loss_clip": 1.13439429, + "balance_loss_mlp": 1.03370404, + "epoch": 0.6838213190645103, + "flos": 17568309309120.0, + "grad_norm": 2.146663832828332, + "language_loss": 0.77298975, + "learning_rate": 9.600219594812575e-07, + "loss": 0.80059862, + "num_input_tokens_seen": 122303240, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.35546875, + "step": 5687, + "time_per_iteration": 3.047022819519043 + }, + { + "auxiliary_loss_clip": 0.01494328, + "auxiliary_loss_mlp": 0.01263049, + "balance_loss_clip": 1.13876116, + "balance_loss_mlp": 1.02787244, + "epoch": 0.6839415619551494, + "flos": 23114789163360.0, + "grad_norm": 1.566235798664605, + "language_loss": 0.72598314, + "learning_rate": 9.593566618744786e-07, + "loss": 0.75355691, + "num_input_tokens_seen": 122323390, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.34570312, + "step": 5688, + "time_per_iteration": 3.0991904735565186 + }, + { + "auxiliary_loss_clip": 0.0149043, + "auxiliary_loss_mlp": 0.012578, + "balance_loss_clip": 1.13515294, + "balance_loss_mlp": 1.02434015, + "epoch": 0.6840618048457885, + "flos": 22130088436800.0, + "grad_norm": 2.456962123647895, + "language_loss": 0.74327868, + "learning_rate": 9.58691522132466e-07, + "loss": 0.77076089, + "num_input_tokens_seen": 122342200, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 2.33007812, + "step": 5689, + "time_per_iteration": 3.168593406677246 + }, + { + "auxiliary_loss_clip": 0.01487294, + "auxiliary_loss_mlp": 0.01271646, + "balance_loss_clip": 1.13058972, + "balance_loss_mlp": 1.03246462, + "epoch": 0.6841820477364275, + "flos": 22017971702880.0, + "grad_norm": 1.994183264296311, + "language_loss": 0.84547019, + "learning_rate": 9.58026540356123e-07, + "loss": 0.87305963, + "num_input_tokens_seen": 122360465, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.38671875, + "step": 5690, + "time_per_iteration": 3.0770111083984375 + }, + { + "auxiliary_loss_clip": 0.01490002, + "auxiliary_loss_mlp": 0.01274406, + "balance_loss_clip": 1.13197112, + "balance_loss_mlp": 1.03694081, + "epoch": 0.6843022906270667, + "flos": 24902892190080.0, + "grad_norm": 1.680782450799321, + "language_loss": 0.8649714, + "learning_rate": 9.573617166463246e-07, + "loss": 0.89261544, + "num_input_tokens_seen": 122381680, + "router_z_loss_clip": 3.58398438, + "router_z_loss_mlp": 2.36914062, + "step": 5691, + "time_per_iteration": 3.1152658462524414 + }, + { + "auxiliary_loss_clip": 0.01493806, + "auxiliary_loss_mlp": 0.01266431, + "balance_loss_clip": 1.13717961, + "balance_loss_mlp": 1.03011096, + "epoch": 0.6844225335177058, + "flos": 19971727068960.0, + "grad_norm": 2.6393497504883565, + "language_loss": 0.6015265, + "learning_rate": 9.56697051103924e-07, + "loss": 0.62912893, + "num_input_tokens_seen": 122399120, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.35742188, + "step": 5692, + "time_per_iteration": 3.1768202781677246 + }, + { + "auxiliary_loss_clip": 0.01486257, + "auxiliary_loss_mlp": 0.01258223, + "balance_loss_clip": 1.12982297, + "balance_loss_mlp": 1.022475, + "epoch": 0.6845427764083448, + "flos": 25885620652320.0, + "grad_norm": 4.010108931535455, + "language_loss": 0.81300592, + "learning_rate": 9.560325438297522e-07, + "loss": 0.84045076, + "num_input_tokens_seen": 122417430, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.3515625, + "step": 5693, + "time_per_iteration": 4.029575347900391 + }, + { + "auxiliary_loss_clip": 0.01495888, + "auxiliary_loss_mlp": 0.01261053, + "balance_loss_clip": 1.14010358, + "balance_loss_mlp": 1.02759409, + "epoch": 0.684663019298984, + "flos": 18882153887040.0, + "grad_norm": 2.299143008443294, + "language_loss": 0.87213361, + "learning_rate": 9.553681949246127e-07, + "loss": 0.89970303, + "num_input_tokens_seen": 122435055, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.33203125, + "step": 5694, + "time_per_iteration": 2.9990322589874268 + }, + { + "auxiliary_loss_clip": 0.01497516, + "auxiliary_loss_mlp": 0.01266469, + "balance_loss_clip": 1.14171994, + "balance_loss_mlp": 1.02728689, + "epoch": 0.684783262189623, + "flos": 54197454252960.0, + "grad_norm": 2.9222594112101334, + "language_loss": 0.7562685, + "learning_rate": 9.547040044892886e-07, + "loss": 0.78390837, + "num_input_tokens_seen": 122462570, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.38671875, + "step": 5695, + "time_per_iteration": 3.3156840801239014 + }, + { + "auxiliary_loss_clip": 0.01453156, + "auxiliary_loss_mlp": 0.01201996, + "balance_loss_clip": 1.11060381, + "balance_loss_mlp": 1.00744629, + "epoch": 0.6849035050802621, + "flos": 63976493162880.0, + "grad_norm": 0.8640534031772448, + "language_loss": 0.60023534, + "learning_rate": 9.540399726245354e-07, + "loss": 0.62678695, + "num_input_tokens_seen": 122519275, + "router_z_loss_clip": 3.421875, + "router_z_loss_mlp": 1.9453125, + "step": 5696, + "time_per_iteration": 4.253336668014526 + }, + { + "auxiliary_loss_clip": 0.01491925, + "auxiliary_loss_mlp": 0.01270946, + "balance_loss_clip": 1.13579345, + "balance_loss_mlp": 1.03290915, + "epoch": 0.6850237479709013, + "flos": 25226233032960.0, + "grad_norm": 2.048831274735196, + "language_loss": 0.69121569, + "learning_rate": 9.533760994310859e-07, + "loss": 0.71884441, + "num_input_tokens_seen": 122539675, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.375, + "step": 5697, + "time_per_iteration": 3.155982255935669 + }, + { + "auxiliary_loss_clip": 0.01491731, + "auxiliary_loss_mlp": 0.01268229, + "balance_loss_clip": 1.13429224, + "balance_loss_mlp": 1.03286266, + "epoch": 0.6851439908615403, + "flos": 19356146838720.0, + "grad_norm": 2.368397116750106, + "language_loss": 0.75690973, + "learning_rate": 9.527123850096508e-07, + "loss": 0.7845093, + "num_input_tokens_seen": 122558035, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.34765625, + "step": 5698, + "time_per_iteration": 3.9159624576568604 + }, + { + "auxiliary_loss_clip": 0.01491705, + "auxiliary_loss_mlp": 0.01266432, + "balance_loss_clip": 1.13519478, + "balance_loss_mlp": 1.03087473, + "epoch": 0.6852642337521794, + "flos": 23184198424800.0, + "grad_norm": 2.203832954824271, + "language_loss": 0.71779591, + "learning_rate": 9.520488294609142e-07, + "loss": 0.7453773, + "num_input_tokens_seen": 122576815, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.3515625, + "step": 5699, + "time_per_iteration": 2.998398542404175 + }, + { + "auxiliary_loss_clip": 0.01452619, + "auxiliary_loss_mlp": 0.01207985, + "balance_loss_clip": 1.11118901, + "balance_loss_mlp": 1.01381683, + "epoch": 0.6853844766428185, + "flos": 62652825191520.0, + "grad_norm": 0.8330844452539548, + "language_loss": 0.53783399, + "learning_rate": 9.513854328855368e-07, + "loss": 0.56444001, + "num_input_tokens_seen": 122634690, + "router_z_loss_clip": 3.40625, + "router_z_loss_mlp": 1.94140625, + "step": 5700, + "time_per_iteration": 3.280996799468994 + }, + { + "auxiliary_loss_clip": 0.01489318, + "auxiliary_loss_mlp": 0.01254071, + "balance_loss_clip": 1.13473892, + "balance_loss_mlp": 1.01965761, + "epoch": 0.6855047195334576, + "flos": 23439305779200.0, + "grad_norm": 2.1156736386259545, + "language_loss": 0.8128581, + "learning_rate": 9.507221953841558e-07, + "loss": 0.84029198, + "num_input_tokens_seen": 122652320, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.33984375, + "step": 5701, + "time_per_iteration": 2.9990346431732178 + }, + { + "auxiliary_loss_clip": 0.01495631, + "auxiliary_loss_mlp": 0.01263869, + "balance_loss_clip": 1.13951635, + "balance_loss_mlp": 1.02850199, + "epoch": 0.6856249624240967, + "flos": 20666767523040.0, + "grad_norm": 1.6152008859405385, + "language_loss": 0.77812219, + "learning_rate": 9.500591170573824e-07, + "loss": 0.80571717, + "num_input_tokens_seen": 122672340, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.34960938, + "step": 5702, + "time_per_iteration": 3.1717565059661865 + }, + { + "auxiliary_loss_clip": 0.01488989, + "auxiliary_loss_mlp": 0.012661, + "balance_loss_clip": 1.13228166, + "balance_loss_mlp": 1.02729952, + "epoch": 0.6857452053147358, + "flos": 17088930558720.0, + "grad_norm": 1.9789118231378946, + "language_loss": 0.7435441, + "learning_rate": 9.493961980058078e-07, + "loss": 0.77109504, + "num_input_tokens_seen": 122689935, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.3828125, + "step": 5703, + "time_per_iteration": 3.002788543701172 + }, + { + "auxiliary_loss_clip": 0.01486767, + "auxiliary_loss_mlp": 0.01258018, + "balance_loss_clip": 1.13082254, + "balance_loss_mlp": 1.02150726, + "epoch": 0.6858654482053749, + "flos": 30849935349600.0, + "grad_norm": 3.5328497666477543, + "language_loss": 0.67964995, + "learning_rate": 9.48733438329993e-07, + "loss": 0.70709789, + "num_input_tokens_seen": 122710200, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.359375, + "step": 5704, + "time_per_iteration": 3.1653971672058105 + }, + { + "auxiliary_loss_clip": 0.01489014, + "auxiliary_loss_mlp": 0.01263675, + "balance_loss_clip": 1.13198984, + "balance_loss_mlp": 1.03155017, + "epoch": 0.6859856910960139, + "flos": 28879889117760.0, + "grad_norm": 1.7076251808851017, + "language_loss": 0.74108452, + "learning_rate": 9.480708381304807e-07, + "loss": 0.76861143, + "num_input_tokens_seen": 122731495, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.31640625, + "step": 5705, + "time_per_iteration": 3.8859663009643555 + }, + { + "auxiliary_loss_clip": 0.01494031, + "auxiliary_loss_mlp": 0.01256728, + "balance_loss_clip": 1.13756871, + "balance_loss_mlp": 1.02078938, + "epoch": 0.6861059339866531, + "flos": 19356564048480.0, + "grad_norm": 2.321618923095687, + "language_loss": 0.83559823, + "learning_rate": 9.474083975077858e-07, + "loss": 0.86310577, + "num_input_tokens_seen": 122748620, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.35351562, + "step": 5706, + "time_per_iteration": 3.0000648498535156 + }, + { + "auxiliary_loss_clip": 0.01490946, + "auxiliary_loss_mlp": 0.01256127, + "balance_loss_clip": 1.1337353, + "balance_loss_mlp": 1.02076006, + "epoch": 0.6862261768772921, + "flos": 22202152669440.0, + "grad_norm": 2.854865046356027, + "language_loss": 0.80430162, + "learning_rate": 9.467461165623994e-07, + "loss": 0.83177233, + "num_input_tokens_seen": 122767670, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.34765625, + "step": 5707, + "time_per_iteration": 3.055459499359131 + }, + { + "auxiliary_loss_clip": 0.01494601, + "auxiliary_loss_mlp": 0.01262242, + "balance_loss_clip": 1.13864541, + "balance_loss_mlp": 1.02687526, + "epoch": 0.6863464197679312, + "flos": 26288042437440.0, + "grad_norm": 2.1166035614105363, + "language_loss": 0.79460293, + "learning_rate": 9.46083995394791e-07, + "loss": 0.82217139, + "num_input_tokens_seen": 122785480, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.34765625, + "step": 5708, + "time_per_iteration": 3.0065577030181885 + }, + { + "auxiliary_loss_clip": 0.01490101, + "auxiliary_loss_mlp": 0.01266684, + "balance_loss_clip": 1.13407898, + "balance_loss_mlp": 1.0347507, + "epoch": 0.6864666626585703, + "flos": 37818242346240.0, + "grad_norm": 3.0237918876790593, + "language_loss": 0.63380706, + "learning_rate": 9.454220341054012e-07, + "loss": 0.66137493, + "num_input_tokens_seen": 122810265, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.31640625, + "step": 5709, + "time_per_iteration": 3.1321094036102295 + }, + { + "auxiliary_loss_clip": 0.01491305, + "auxiliary_loss_mlp": 0.01256099, + "balance_loss_clip": 1.13548911, + "balance_loss_mlp": 1.02092242, + "epoch": 0.6865869055492094, + "flos": 19393203015360.0, + "grad_norm": 2.8586233576477658, + "language_loss": 0.80840778, + "learning_rate": 9.447602327946512e-07, + "loss": 0.83588183, + "num_input_tokens_seen": 122828905, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.34570312, + "step": 5710, + "time_per_iteration": 3.079176425933838 + }, + { + "auxiliary_loss_clip": 0.01495002, + "auxiliary_loss_mlp": 0.01262672, + "balance_loss_clip": 1.14084589, + "balance_loss_mlp": 1.02520716, + "epoch": 0.6867071484398485, + "flos": 20378055454560.0, + "grad_norm": 2.042906023391396, + "language_loss": 0.76528102, + "learning_rate": 9.440985915629338e-07, + "loss": 0.79285777, + "num_input_tokens_seen": 122846235, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.36914062, + "step": 5711, + "time_per_iteration": 3.009685754776001 + }, + { + "auxiliary_loss_clip": 0.01489575, + "auxiliary_loss_mlp": 0.01265248, + "balance_loss_clip": 1.13383174, + "balance_loss_mlp": 1.03236115, + "epoch": 0.6868273913304875, + "flos": 15891033458880.0, + "grad_norm": 2.1928754793288197, + "language_loss": 0.73825026, + "learning_rate": 9.434371105106223e-07, + "loss": 0.76579845, + "num_input_tokens_seen": 122863835, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.32617188, + "step": 5712, + "time_per_iteration": 3.039634943008423 + }, + { + "auxiliary_loss_clip": 0.01488808, + "auxiliary_loss_mlp": 0.01262708, + "balance_loss_clip": 1.13380051, + "balance_loss_mlp": 1.02581477, + "epoch": 0.6869476342211267, + "flos": 24464817570240.0, + "grad_norm": 1.9278978665779105, + "language_loss": 0.70427001, + "learning_rate": 9.427757897380602e-07, + "loss": 0.73178512, + "num_input_tokens_seen": 122883235, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.36523438, + "step": 5713, + "time_per_iteration": 2.959664821624756 + }, + { + "auxiliary_loss_clip": 0.01485228, + "auxiliary_loss_mlp": 0.01261443, + "balance_loss_clip": 1.1290884, + "balance_loss_mlp": 1.02416873, + "epoch": 0.6870678771117658, + "flos": 18444572333280.0, + "grad_norm": 2.143022377819459, + "language_loss": 0.85024047, + "learning_rate": 9.421146293455695e-07, + "loss": 0.87770706, + "num_input_tokens_seen": 122898975, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.3671875, + "step": 5714, + "time_per_iteration": 2.9985811710357666 + }, + { + "auxiliary_loss_clip": 0.01499321, + "auxiliary_loss_mlp": 0.01263958, + "balance_loss_clip": 1.14576721, + "balance_loss_mlp": 1.02592087, + "epoch": 0.6871881200024048, + "flos": 22202569879200.0, + "grad_norm": 1.8404908615854676, + "language_loss": 0.68274784, + "learning_rate": 9.414536294334489e-07, + "loss": 0.71038067, + "num_input_tokens_seen": 122918995, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 2.375, + "step": 5715, + "time_per_iteration": 3.1006956100463867 + }, + { + "auxiliary_loss_clip": 0.01488968, + "auxiliary_loss_mlp": 0.0125576, + "balance_loss_clip": 1.13339043, + "balance_loss_mlp": 1.02077484, + "epoch": 0.687308362893044, + "flos": 22129936724160.0, + "grad_norm": 2.045046645099858, + "language_loss": 0.69855779, + "learning_rate": 9.407927901019708e-07, + "loss": 0.72600508, + "num_input_tokens_seen": 122938125, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.34570312, + "step": 5716, + "time_per_iteration": 2.9711077213287354 + }, + { + "auxiliary_loss_clip": 0.01487969, + "auxiliary_loss_mlp": 0.012582, + "balance_loss_clip": 1.13170815, + "balance_loss_mlp": 1.0224514, + "epoch": 0.687428605783683, + "flos": 25042583060640.0, + "grad_norm": 2.2838466399022836, + "language_loss": 0.76698548, + "learning_rate": 9.401321114513854e-07, + "loss": 0.79444718, + "num_input_tokens_seen": 122957020, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.3515625, + "step": 5717, + "time_per_iteration": 2.96870493888855 + }, + { + "auxiliary_loss_clip": 0.01490672, + "auxiliary_loss_mlp": 0.01270371, + "balance_loss_clip": 1.13450789, + "balance_loss_mlp": 1.03405023, + "epoch": 0.6875488486743221, + "flos": 23772394159200.0, + "grad_norm": 1.8276265406437435, + "language_loss": 0.75663686, + "learning_rate": 9.394715935819155e-07, + "loss": 0.78424728, + "num_input_tokens_seen": 122977410, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.35742188, + "step": 5718, + "time_per_iteration": 2.948631525039673 + }, + { + "auxiliary_loss_clip": 0.01495582, + "auxiliary_loss_mlp": 0.01263483, + "balance_loss_clip": 1.14044273, + "balance_loss_mlp": 1.02411056, + "epoch": 0.6876690915649613, + "flos": 25519117199040.0, + "grad_norm": 3.6183287372589734, + "language_loss": 0.62702084, + "learning_rate": 9.388112365937608e-07, + "loss": 0.65461147, + "num_input_tokens_seen": 122996875, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 2.38867188, + "step": 5719, + "time_per_iteration": 2.9454686641693115 + }, + { + "auxiliary_loss_clip": 0.01497535, + "auxiliary_loss_mlp": 0.0125901, + "balance_loss_clip": 1.14206719, + "balance_loss_mlp": 1.02307129, + "epoch": 0.6877893344556003, + "flos": 19430069551200.0, + "grad_norm": 2.202743454831537, + "language_loss": 0.82397926, + "learning_rate": 9.381510405870985e-07, + "loss": 0.85154474, + "num_input_tokens_seen": 123015890, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.35351562, + "step": 5720, + "time_per_iteration": 3.7684872150421143 + }, + { + "auxiliary_loss_clip": 0.01498127, + "auxiliary_loss_mlp": 0.01262728, + "balance_loss_clip": 1.14310575, + "balance_loss_mlp": 1.02736127, + "epoch": 0.6879095773462394, + "flos": 18663306217920.0, + "grad_norm": 2.319961264904563, + "language_loss": 0.77500141, + "learning_rate": 9.374910056620791e-07, + "loss": 0.80260998, + "num_input_tokens_seen": 123034955, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 2.34765625, + "step": 5721, + "time_per_iteration": 2.936683177947998 + }, + { + "auxiliary_loss_clip": 0.01488688, + "auxiliary_loss_mlp": 0.01275207, + "balance_loss_clip": 1.13171399, + "balance_loss_mlp": 1.03755188, + "epoch": 0.6880298202368785, + "flos": 20885046269760.0, + "grad_norm": 1.7476358519994155, + "language_loss": 0.81420493, + "learning_rate": 9.368311319188293e-07, + "loss": 0.8418439, + "num_input_tokens_seen": 123052770, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.37109375, + "step": 5722, + "time_per_iteration": 2.9802560806274414 + }, + { + "auxiliary_loss_clip": 0.01490419, + "auxiliary_loss_mlp": 0.01266216, + "balance_loss_clip": 1.13494575, + "balance_loss_mlp": 1.03065836, + "epoch": 0.6881500631275176, + "flos": 30155881027680.0, + "grad_norm": 2.2673366983191894, + "language_loss": 0.79410964, + "learning_rate": 9.361714194574515e-07, + "loss": 0.82167602, + "num_input_tokens_seen": 123075105, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.34960938, + "step": 5723, + "time_per_iteration": 2.9985358715057373 + }, + { + "auxiliary_loss_clip": 0.01454933, + "auxiliary_loss_mlp": 0.01200653, + "balance_loss_clip": 1.11449242, + "balance_loss_mlp": 1.0038147, + "epoch": 0.6882703060181566, + "flos": 66189130092000.0, + "grad_norm": 0.8048122032663912, + "language_loss": 0.58273041, + "learning_rate": 9.355118683780228e-07, + "loss": 0.60928631, + "num_input_tokens_seen": 123145175, + "router_z_loss_clip": 3.40625, + "router_z_loss_mlp": 1.96484375, + "step": 5724, + "time_per_iteration": 4.445208787918091 + }, + { + "auxiliary_loss_clip": 0.01489607, + "auxiliary_loss_mlp": 0.01262598, + "balance_loss_clip": 1.1327796, + "balance_loss_mlp": 1.02723098, + "epoch": 0.6883905489087958, + "flos": 18216128839680.0, + "grad_norm": 2.138757611123417, + "language_loss": 0.79262364, + "learning_rate": 9.348524787805987e-07, + "loss": 0.82014573, + "num_input_tokens_seen": 123160365, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 2.34765625, + "step": 5725, + "time_per_iteration": 3.859862804412842 + }, + { + "auxiliary_loss_clip": 0.01488203, + "auxiliary_loss_mlp": 0.01271195, + "balance_loss_clip": 1.1317991, + "balance_loss_mlp": 1.03506517, + "epoch": 0.6885107917994349, + "flos": 14057567988480.0, + "grad_norm": 2.6512221819168467, + "language_loss": 0.85126781, + "learning_rate": 9.341932507652053e-07, + "loss": 0.87886178, + "num_input_tokens_seen": 123174855, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.35546875, + "step": 5726, + "time_per_iteration": 2.960376501083374 + }, + { + "auxiliary_loss_clip": 0.01488006, + "auxiliary_loss_mlp": 0.01269118, + "balance_loss_clip": 1.13067269, + "balance_loss_mlp": 1.03489614, + "epoch": 0.6886310346900739, + "flos": 28693053180000.0, + "grad_norm": 2.4935815864242183, + "language_loss": 0.78764105, + "learning_rate": 9.335341844318489e-07, + "loss": 0.81521231, + "num_input_tokens_seen": 123194995, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 2.33789062, + "step": 5727, + "time_per_iteration": 2.9582347869873047 + }, + { + "auxiliary_loss_clip": 0.01491432, + "auxiliary_loss_mlp": 0.0126656, + "balance_loss_clip": 1.13572896, + "balance_loss_mlp": 1.03023911, + "epoch": 0.6887512775807131, + "flos": 24537640366080.0, + "grad_norm": 2.303643716051257, + "language_loss": 0.73217762, + "learning_rate": 9.328752798805091e-07, + "loss": 0.75975752, + "num_input_tokens_seen": 123213465, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.35742188, + "step": 5728, + "time_per_iteration": 2.9725496768951416 + }, + { + "auxiliary_loss_clip": 0.01489145, + "auxiliary_loss_mlp": 0.01270528, + "balance_loss_clip": 1.13239539, + "balance_loss_mlp": 1.0326817, + "epoch": 0.6888715204713521, + "flos": 22416297246720.0, + "grad_norm": 2.168545289472234, + "language_loss": 0.76375067, + "learning_rate": 9.322165372111399e-07, + "loss": 0.79134738, + "num_input_tokens_seen": 123231610, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 2.37304688, + "step": 5729, + "time_per_iteration": 2.955087184906006 + }, + { + "auxiliary_loss_clip": 0.01492017, + "auxiliary_loss_mlp": 0.01259447, + "balance_loss_clip": 1.13607216, + "balance_loss_mlp": 1.02407992, + "epoch": 0.6889917633619912, + "flos": 22056507077760.0, + "grad_norm": 1.9471502646841483, + "language_loss": 0.75233114, + "learning_rate": 9.315579565236747e-07, + "loss": 0.77984577, + "num_input_tokens_seen": 123250715, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.34765625, + "step": 5730, + "time_per_iteration": 2.9394235610961914 + }, + { + "auxiliary_loss_clip": 0.01489618, + "auxiliary_loss_mlp": 0.0126414, + "balance_loss_clip": 1.13320351, + "balance_loss_mlp": 1.0285821, + "epoch": 0.6891120062526304, + "flos": 23951909962080.0, + "grad_norm": 3.1468807776511056, + "language_loss": 0.74532193, + "learning_rate": 9.308995379180162e-07, + "loss": 0.77285945, + "num_input_tokens_seen": 123270270, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.34960938, + "step": 5731, + "time_per_iteration": 3.0608911514282227 + }, + { + "auxiliary_loss_clip": 0.0145478, + "auxiliary_loss_mlp": 0.01198746, + "balance_loss_clip": 1.11397195, + "balance_loss_mlp": 1.00572205, + "epoch": 0.6892322491432694, + "flos": 64123655880960.0, + "grad_norm": 0.7429037479189262, + "language_loss": 0.59472078, + "learning_rate": 9.302412814940488e-07, + "loss": 0.62125605, + "num_input_tokens_seen": 123333045, + "router_z_loss_clip": 3.40625, + "router_z_loss_mlp": 1.9296875, + "step": 5732, + "time_per_iteration": 3.539715051651001 + }, + { + "auxiliary_loss_clip": 0.01492911, + "auxiliary_loss_mlp": 0.0126561, + "balance_loss_clip": 1.13690424, + "balance_loss_mlp": 1.03157854, + "epoch": 0.6893524920339085, + "flos": 23004644693760.0, + "grad_norm": 3.4705142418529764, + "language_loss": 0.71454495, + "learning_rate": 9.295831873516276e-07, + "loss": 0.7421301, + "num_input_tokens_seen": 123352320, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.3359375, + "step": 5733, + "time_per_iteration": 3.9778027534484863 + }, + { + "auxiliary_loss_clip": 0.01492974, + "auxiliary_loss_mlp": 0.01258695, + "balance_loss_clip": 1.1359483, + "balance_loss_mlp": 1.02256513, + "epoch": 0.6894727349245476, + "flos": 21398333159520.0, + "grad_norm": 1.904510807359968, + "language_loss": 0.76124197, + "learning_rate": 9.289252555905873e-07, + "loss": 0.78875864, + "num_input_tokens_seen": 123372400, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.35546875, + "step": 5734, + "time_per_iteration": 3.0404744148254395 + }, + { + "auxiliary_loss_clip": 0.01496089, + "auxiliary_loss_mlp": 0.01276033, + "balance_loss_clip": 1.14067066, + "balance_loss_mlp": 1.03952217, + "epoch": 0.6895929778151867, + "flos": 19867385607840.0, + "grad_norm": 1.9898279674396697, + "language_loss": 0.76103622, + "learning_rate": 9.282674863107334e-07, + "loss": 0.78875744, + "num_input_tokens_seen": 123390215, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.359375, + "step": 5735, + "time_per_iteration": 2.98222017288208 + }, + { + "auxiliary_loss_clip": 0.014912, + "auxiliary_loss_mlp": 0.0127199, + "balance_loss_clip": 1.13468659, + "balance_loss_mlp": 1.03471565, + "epoch": 0.6897132207058257, + "flos": 18180703573920.0, + "grad_norm": 2.9803698462326875, + "language_loss": 0.76225102, + "learning_rate": 9.276098796118488e-07, + "loss": 0.7898829, + "num_input_tokens_seen": 123406700, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.36914062, + "step": 5736, + "time_per_iteration": 2.974454402923584 + }, + { + "auxiliary_loss_clip": 0.01490961, + "auxiliary_loss_mlp": 0.01254824, + "balance_loss_clip": 1.13539982, + "balance_loss_mlp": 1.02098274, + "epoch": 0.6898334635964649, + "flos": 32564381160960.0, + "grad_norm": 1.6769492175123277, + "language_loss": 0.66162449, + "learning_rate": 9.269524355936938e-07, + "loss": 0.68908238, + "num_input_tokens_seen": 123429880, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.33398438, + "step": 5737, + "time_per_iteration": 3.0150930881500244 + }, + { + "auxiliary_loss_clip": 0.01487257, + "auxiliary_loss_mlp": 0.01259829, + "balance_loss_clip": 1.13177514, + "balance_loss_mlp": 1.02598834, + "epoch": 0.689953706487104, + "flos": 22821146434080.0, + "grad_norm": 3.8619235484627894, + "language_loss": 0.84864724, + "learning_rate": 9.262951543560002e-07, + "loss": 0.87611806, + "num_input_tokens_seen": 123449105, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.33398438, + "step": 5738, + "time_per_iteration": 2.948115348815918 + }, + { + "auxiliary_loss_clip": 0.01495028, + "auxiliary_loss_mlp": 0.01265039, + "balance_loss_clip": 1.13854778, + "balance_loss_mlp": 1.02700162, + "epoch": 0.690073949377743, + "flos": 18517357200960.0, + "grad_norm": 3.1322087244041574, + "language_loss": 0.86698604, + "learning_rate": 9.256380359984795e-07, + "loss": 0.89458668, + "num_input_tokens_seen": 123466215, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.375, + "step": 5739, + "time_per_iteration": 2.962820053100586 + }, + { + "auxiliary_loss_clip": 0.01498604, + "auxiliary_loss_mlp": 0.01276364, + "balance_loss_clip": 1.14315224, + "balance_loss_mlp": 1.0390898, + "epoch": 0.6901941922683821, + "flos": 34859892212640.0, + "grad_norm": 1.9989020835068154, + "language_loss": 0.74711198, + "learning_rate": 9.249810806208139e-07, + "loss": 0.77486169, + "num_input_tokens_seen": 123485480, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.3671875, + "step": 5740, + "time_per_iteration": 3.0547659397125244 + }, + { + "auxiliary_loss_clip": 0.01490405, + "auxiliary_loss_mlp": 0.01258864, + "balance_loss_clip": 1.13593531, + "balance_loss_mlp": 1.02483177, + "epoch": 0.6903144351590212, + "flos": 16255564647840.0, + "grad_norm": 2.436521353892491, + "language_loss": 0.80330539, + "learning_rate": 9.243242883226627e-07, + "loss": 0.83079803, + "num_input_tokens_seen": 123504575, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.33398438, + "step": 5741, + "time_per_iteration": 2.9114017486572266 + }, + { + "auxiliary_loss_clip": 0.01488125, + "auxiliary_loss_mlp": 0.01271476, + "balance_loss_clip": 1.13203549, + "balance_loss_mlp": 1.03649104, + "epoch": 0.6904346780496603, + "flos": 28037496304800.0, + "grad_norm": 2.2277181994306803, + "language_loss": 0.69545555, + "learning_rate": 9.236676592036628e-07, + "loss": 0.72305155, + "num_input_tokens_seen": 123524250, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.34375, + "step": 5742, + "time_per_iteration": 2.9971988201141357 + }, + { + "auxiliary_loss_clip": 0.01496712, + "auxiliary_loss_mlp": 0.01261853, + "balance_loss_clip": 1.141366, + "balance_loss_mlp": 1.02763104, + "epoch": 0.6905549209402994, + "flos": 23626596854880.0, + "grad_norm": 1.854594953681046, + "language_loss": 0.73811895, + "learning_rate": 9.230111933634228e-07, + "loss": 0.76570463, + "num_input_tokens_seen": 123545845, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.3359375, + "step": 5743, + "time_per_iteration": 2.9526822566986084 + }, + { + "auxiliary_loss_clip": 0.01495431, + "auxiliary_loss_mlp": 0.0126165, + "balance_loss_clip": 1.14060092, + "balance_loss_mlp": 1.02551997, + "epoch": 0.6906751638309385, + "flos": 23117330350080.0, + "grad_norm": 2.275682267604985, + "language_loss": 0.80982769, + "learning_rate": 9.223548909015288e-07, + "loss": 0.83739853, + "num_input_tokens_seen": 123567535, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.35546875, + "step": 5744, + "time_per_iteration": 2.9993703365325928 + }, + { + "auxiliary_loss_clip": 0.01492042, + "auxiliary_loss_mlp": 0.01262064, + "balance_loss_clip": 1.13704228, + "balance_loss_mlp": 1.02650642, + "epoch": 0.6907954067215776, + "flos": 27307789148160.0, + "grad_norm": 2.620733893153266, + "language_loss": 0.72412479, + "learning_rate": 9.216987519175407e-07, + "loss": 0.75166583, + "num_input_tokens_seen": 123587710, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.3515625, + "step": 5745, + "time_per_iteration": 3.0409369468688965 + }, + { + "auxiliary_loss_clip": 0.0149104, + "auxiliary_loss_mlp": 0.01267299, + "balance_loss_clip": 1.13657641, + "balance_loss_mlp": 1.03288615, + "epoch": 0.6909156496122166, + "flos": 21691558679040.0, + "grad_norm": 1.857332159394956, + "language_loss": 0.68656683, + "learning_rate": 9.210427765109942e-07, + "loss": 0.71415019, + "num_input_tokens_seen": 123607385, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.33984375, + "step": 5746, + "time_per_iteration": 2.9535937309265137 + }, + { + "auxiliary_loss_clip": 0.01490294, + "auxiliary_loss_mlp": 0.01275596, + "balance_loss_clip": 1.1344924, + "balance_loss_mlp": 1.03546071, + "epoch": 0.6910358925028558, + "flos": 22563459964800.0, + "grad_norm": 2.046309351073513, + "language_loss": 0.81343621, + "learning_rate": 9.20386964781402e-07, + "loss": 0.84109515, + "num_input_tokens_seen": 123625405, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.39648438, + "step": 5747, + "time_per_iteration": 3.8437533378601074 + }, + { + "auxiliary_loss_clip": 0.01491194, + "auxiliary_loss_mlp": 0.01268595, + "balance_loss_clip": 1.13748384, + "balance_loss_mlp": 1.03341901, + "epoch": 0.6911561353934949, + "flos": 22056279508800.0, + "grad_norm": 2.199770498468408, + "language_loss": 0.84730124, + "learning_rate": 9.197313168282472e-07, + "loss": 0.87489909, + "num_input_tokens_seen": 123642850, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 2.34765625, + "step": 5748, + "time_per_iteration": 3.2466347217559814 + }, + { + "auxiliary_loss_clip": 0.01490433, + "auxiliary_loss_mlp": 0.0125751, + "balance_loss_clip": 1.13531148, + "balance_loss_mlp": 1.02042639, + "epoch": 0.6912763782841339, + "flos": 24208837868160.0, + "grad_norm": 2.3068920819222565, + "language_loss": 0.73149592, + "learning_rate": 9.190758327509935e-07, + "loss": 0.75897533, + "num_input_tokens_seen": 123661595, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.36523438, + "step": 5749, + "time_per_iteration": 2.988290309906006 + }, + { + "auxiliary_loss_clip": 0.0145604, + "auxiliary_loss_mlp": 0.01196915, + "balance_loss_clip": 1.11605895, + "balance_loss_mlp": 1.00465393, + "epoch": 0.6913966211747731, + "flos": 52335124724160.0, + "grad_norm": 0.948588206909575, + "language_loss": 0.64378291, + "learning_rate": 9.184205126490767e-07, + "loss": 0.67031252, + "num_input_tokens_seen": 123710490, + "router_z_loss_clip": 3.40625, + "router_z_loss_mlp": 1.921875, + "step": 5750, + "time_per_iteration": 3.3293139934539795 + }, + { + "auxiliary_loss_clip": 0.01456912, + "auxiliary_loss_mlp": 0.01193985, + "balance_loss_clip": 1.11678922, + "balance_loss_mlp": 1.00210571, + "epoch": 0.6915168640654121, + "flos": 66747552220800.0, + "grad_norm": 1.123059379914834, + "language_loss": 0.59558499, + "learning_rate": 9.177653566219075e-07, + "loss": 0.62209398, + "num_input_tokens_seen": 123765215, + "router_z_loss_clip": 3.40625, + "router_z_loss_mlp": 1.91796875, + "step": 5751, + "time_per_iteration": 4.211596250534058 + }, + { + "auxiliary_loss_clip": 0.01490457, + "auxiliary_loss_mlp": 0.01262184, + "balance_loss_clip": 1.13534951, + "balance_loss_mlp": 1.02624476, + "epoch": 0.6916371069560512, + "flos": 18298509531840.0, + "grad_norm": 2.3722252451249792, + "language_loss": 0.76188195, + "learning_rate": 9.171103647688744e-07, + "loss": 0.78940833, + "num_input_tokens_seen": 123783955, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 2.35546875, + "step": 5752, + "time_per_iteration": 4.071911334991455 + }, + { + "auxiliary_loss_clip": 0.01491452, + "auxiliary_loss_mlp": 0.0126618, + "balance_loss_clip": 1.13704157, + "balance_loss_mlp": 1.03062248, + "epoch": 0.6917573498466904, + "flos": 19647817303680.0, + "grad_norm": 2.704890247612784, + "language_loss": 0.69272053, + "learning_rate": 9.164555371893367e-07, + "loss": 0.72029686, + "num_input_tokens_seen": 123803885, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.34960938, + "step": 5753, + "time_per_iteration": 2.9781765937805176 + }, + { + "auxiliary_loss_clip": 0.01494682, + "auxiliary_loss_mlp": 0.0126954, + "balance_loss_clip": 1.14044666, + "balance_loss_mlp": 1.03474545, + "epoch": 0.6918775927373294, + "flos": 14212088769600.0, + "grad_norm": 2.0086331868096727, + "language_loss": 0.75358343, + "learning_rate": 9.158008739826333e-07, + "loss": 0.78122568, + "num_input_tokens_seen": 123821485, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 2.34179688, + "step": 5754, + "time_per_iteration": 3.044351100921631 + }, + { + "auxiliary_loss_clip": 0.01490862, + "auxiliary_loss_mlp": 0.01272495, + "balance_loss_clip": 1.13573313, + "balance_loss_mlp": 1.03617489, + "epoch": 0.6919978356279685, + "flos": 23987600724960.0, + "grad_norm": 1.5709127701713452, + "language_loss": 0.86557215, + "learning_rate": 9.151463752480744e-07, + "loss": 0.89320576, + "num_input_tokens_seen": 123840215, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.35742188, + "step": 5755, + "time_per_iteration": 2.9910662174224854 + }, + { + "auxiliary_loss_clip": 0.01488084, + "auxiliary_loss_mlp": 0.01260869, + "balance_loss_clip": 1.1331706, + "balance_loss_mlp": 1.02645636, + "epoch": 0.6921180785186076, + "flos": 23625193512960.0, + "grad_norm": 1.4392726320665208, + "language_loss": 0.80494118, + "learning_rate": 9.144920410849493e-07, + "loss": 0.83243066, + "num_input_tokens_seen": 123861450, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.33984375, + "step": 5756, + "time_per_iteration": 2.992673873901367 + }, + { + "auxiliary_loss_clip": 0.01496794, + "auxiliary_loss_mlp": 0.01264827, + "balance_loss_clip": 1.14144957, + "balance_loss_mlp": 1.03079581, + "epoch": 0.6922383214092467, + "flos": 21144780859680.0, + "grad_norm": 1.803703756749711, + "language_loss": 0.80134261, + "learning_rate": 9.138378715925176e-07, + "loss": 0.82895881, + "num_input_tokens_seen": 123880545, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.33398438, + "step": 5757, + "time_per_iteration": 2.988267421722412 + }, + { + "auxiliary_loss_clip": 0.01490734, + "auxiliary_loss_mlp": 0.01256023, + "balance_loss_clip": 1.13584328, + "balance_loss_mlp": 1.02008367, + "epoch": 0.6923585642998857, + "flos": 21472824794400.0, + "grad_norm": 2.0884592531699084, + "language_loss": 0.80957603, + "learning_rate": 9.131838668700167e-07, + "loss": 0.83704352, + "num_input_tokens_seen": 123900615, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.35546875, + "step": 5758, + "time_per_iteration": 3.1953063011169434 + }, + { + "auxiliary_loss_clip": 0.01494528, + "auxiliary_loss_mlp": 0.01269278, + "balance_loss_clip": 1.14054322, + "balance_loss_mlp": 1.0342927, + "epoch": 0.6924788071905249, + "flos": 21107307473280.0, + "grad_norm": 1.9265541245067734, + "language_loss": 0.86555028, + "learning_rate": 9.125300270166598e-07, + "loss": 0.89318824, + "num_input_tokens_seen": 123921220, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 2.34570312, + "step": 5759, + "time_per_iteration": 3.354180097579956 + }, + { + "auxiliary_loss_clip": 0.01489189, + "auxiliary_loss_mlp": 0.01257666, + "balance_loss_clip": 1.13486719, + "balance_loss_mlp": 1.02191734, + "epoch": 0.692599050081164, + "flos": 26252427530880.0, + "grad_norm": 2.2007702223656893, + "language_loss": 0.8575871, + "learning_rate": 9.118763521316324e-07, + "loss": 0.88505572, + "num_input_tokens_seen": 123941795, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 2.3515625, + "step": 5760, + "time_per_iteration": 3.1437950134277344 + }, + { + "auxiliary_loss_clip": 0.01487388, + "auxiliary_loss_mlp": 0.01260233, + "balance_loss_clip": 1.1314503, + "balance_loss_mlp": 1.02582026, + "epoch": 0.692719292971803, + "flos": 20887246103040.0, + "grad_norm": 1.680769488634656, + "language_loss": 0.76166129, + "learning_rate": 9.112228423140987e-07, + "loss": 0.78913748, + "num_input_tokens_seen": 123960715, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.33984375, + "step": 5761, + "time_per_iteration": 3.837231159210205 + }, + { + "auxiliary_loss_clip": 0.01492938, + "auxiliary_loss_mlp": 0.01258086, + "balance_loss_clip": 1.13723493, + "balance_loss_mlp": 1.02348173, + "epoch": 0.6928395358624422, + "flos": 25924345668000.0, + "grad_norm": 3.7777123226705696, + "language_loss": 0.86640942, + "learning_rate": 9.105694976631932e-07, + "loss": 0.89391965, + "num_input_tokens_seen": 123978625, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.33984375, + "step": 5762, + "time_per_iteration": 3.0586628913879395 + }, + { + "auxiliary_loss_clip": 0.01494088, + "auxiliary_loss_mlp": 0.01260062, + "balance_loss_clip": 1.13885331, + "balance_loss_mlp": 1.02679372, + "epoch": 0.6929597787530812, + "flos": 23588820043200.0, + "grad_norm": 2.3686129803594076, + "language_loss": 0.72851574, + "learning_rate": 9.099163182780283e-07, + "loss": 0.75605726, + "num_input_tokens_seen": 123996780, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.32617188, + "step": 5763, + "time_per_iteration": 3.0100419521331787 + }, + { + "auxiliary_loss_clip": 0.01490381, + "auxiliary_loss_mlp": 0.01268736, + "balance_loss_clip": 1.13544273, + "balance_loss_mlp": 1.03508592, + "epoch": 0.6930800216437203, + "flos": 18257395042080.0, + "grad_norm": 2.869851178897596, + "language_loss": 0.49405807, + "learning_rate": 9.092633042576916e-07, + "loss": 0.52164924, + "num_input_tokens_seen": 124014045, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 2.33007812, + "step": 5764, + "time_per_iteration": 3.060913562774658 + }, + { + "auxiliary_loss_clip": 0.01494901, + "auxiliary_loss_mlp": 0.01259383, + "balance_loss_clip": 1.14016104, + "balance_loss_mlp": 1.02592313, + "epoch": 0.6932002645343595, + "flos": 29171180301120.0, + "grad_norm": 1.9665314194180503, + "language_loss": 0.56422442, + "learning_rate": 9.086104557012446e-07, + "loss": 0.59176731, + "num_input_tokens_seen": 124034615, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.33007812, + "step": 5765, + "time_per_iteration": 3.1050591468811035 + }, + { + "auxiliary_loss_clip": 0.01491416, + "auxiliary_loss_mlp": 0.01260833, + "balance_loss_clip": 1.13674116, + "balance_loss_mlp": 1.02508473, + "epoch": 0.6933205074249985, + "flos": 23845065242400.0, + "grad_norm": 1.9726146422686535, + "language_loss": 0.65864909, + "learning_rate": 9.079577727077239e-07, + "loss": 0.68617153, + "num_input_tokens_seen": 124053445, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 2.3515625, + "step": 5766, + "time_per_iteration": 3.0236494541168213 + }, + { + "auxiliary_loss_clip": 0.01492404, + "auxiliary_loss_mlp": 0.01261993, + "balance_loss_clip": 1.13705993, + "balance_loss_mlp": 1.02529073, + "epoch": 0.6934407503156376, + "flos": 24168785366880.0, + "grad_norm": 2.168912932942287, + "language_loss": 0.71981275, + "learning_rate": 9.073052553761404e-07, + "loss": 0.74735665, + "num_input_tokens_seen": 124072810, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.36328125, + "step": 5767, + "time_per_iteration": 3.0960309505462646 + }, + { + "auxiliary_loss_clip": 0.01500059, + "auxiliary_loss_mlp": 0.01258731, + "balance_loss_clip": 1.14475203, + "balance_loss_mlp": 1.01802373, + "epoch": 0.6935609932062767, + "flos": 20633428306080.0, + "grad_norm": 7.529392878810317, + "language_loss": 0.78142625, + "learning_rate": 9.066529038054805e-07, + "loss": 0.80901408, + "num_input_tokens_seen": 124092875, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.40234375, + "step": 5768, + "time_per_iteration": 3.050602912902832 + }, + { + "auxiliary_loss_clip": 0.0149344, + "auxiliary_loss_mlp": 0.01258635, + "balance_loss_clip": 1.13797235, + "balance_loss_mlp": 1.02517557, + "epoch": 0.6936812360969158, + "flos": 18255839987520.0, + "grad_norm": 3.609473660202613, + "language_loss": 0.74024355, + "learning_rate": 9.060007180947071e-07, + "loss": 0.76776433, + "num_input_tokens_seen": 124110930, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.328125, + "step": 5769, + "time_per_iteration": 3.0908701419830322 + }, + { + "auxiliary_loss_clip": 0.01486035, + "auxiliary_loss_mlp": 0.01274972, + "balance_loss_clip": 1.12995231, + "balance_loss_mlp": 1.03865123, + "epoch": 0.6938014789875548, + "flos": 31319794131840.0, + "grad_norm": 1.9525897045087983, + "language_loss": 0.73012459, + "learning_rate": 9.053486983427534e-07, + "loss": 0.75773466, + "num_input_tokens_seen": 124132180, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.35742188, + "step": 5770, + "time_per_iteration": 3.1686792373657227 + }, + { + "auxiliary_loss_clip": 0.01485788, + "auxiliary_loss_mlp": 0.01266283, + "balance_loss_clip": 1.13084102, + "balance_loss_mlp": 1.02919924, + "epoch": 0.6939217218781939, + "flos": 17530267000320.0, + "grad_norm": 1.9880025685847202, + "language_loss": 0.70633954, + "learning_rate": 9.046968446485326e-07, + "loss": 0.73386025, + "num_input_tokens_seen": 124150585, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.36523438, + "step": 5771, + "time_per_iteration": 3.040250778198242 + }, + { + "auxiliary_loss_clip": 0.01496853, + "auxiliary_loss_mlp": 0.01271098, + "balance_loss_clip": 1.14236856, + "balance_loss_mlp": 1.0342052, + "epoch": 0.6940419647688331, + "flos": 18553768598880.0, + "grad_norm": 3.9309049353152643, + "language_loss": 0.70730764, + "learning_rate": 9.040451571109295e-07, + "loss": 0.73498714, + "num_input_tokens_seen": 124166205, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 2.36328125, + "step": 5772, + "time_per_iteration": 3.1566383838653564 + }, + { + "auxiliary_loss_clip": 0.01451972, + "auxiliary_loss_mlp": 0.01191345, + "balance_loss_clip": 1.11403942, + "balance_loss_mlp": 0.99794006, + "epoch": 0.6941622076594721, + "flos": 66932719319520.0, + "grad_norm": 0.8283587643609853, + "language_loss": 0.60327327, + "learning_rate": 9.033936358288042e-07, + "loss": 0.6297065, + "num_input_tokens_seen": 124219940, + "router_z_loss_clip": 3.375, + "router_z_loss_mlp": 1.93359375, + "step": 5773, + "time_per_iteration": 3.378323793411255 + }, + { + "auxiliary_loss_clip": 0.01485618, + "auxiliary_loss_mlp": 0.01260112, + "balance_loss_clip": 1.13031232, + "balance_loss_mlp": 1.0251267, + "epoch": 0.6942824505501112, + "flos": 26580471465600.0, + "grad_norm": 1.9210462634724847, + "language_loss": 0.82697237, + "learning_rate": 9.027422809009937e-07, + "loss": 0.85442972, + "num_input_tokens_seen": 124239885, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.34375, + "step": 5774, + "time_per_iteration": 3.096740245819092 + }, + { + "auxiliary_loss_clip": 0.01487848, + "auxiliary_loss_mlp": 0.01255954, + "balance_loss_clip": 1.13222456, + "balance_loss_mlp": 1.02287602, + "epoch": 0.6944026934407503, + "flos": 21250260165600.0, + "grad_norm": 1.7598578337775301, + "language_loss": 0.83257043, + "learning_rate": 9.020910924263054e-07, + "loss": 0.86000848, + "num_input_tokens_seen": 124258410, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.32617188, + "step": 5775, + "time_per_iteration": 4.046630382537842 + }, + { + "auxiliary_loss_clip": 0.01449894, + "auxiliary_loss_mlp": 0.01197548, + "balance_loss_clip": 1.1119473, + "balance_loss_mlp": 1.0049057, + "epoch": 0.6945229363313894, + "flos": 70683434658720.0, + "grad_norm": 0.844295672074351, + "language_loss": 0.58123147, + "learning_rate": 9.014400705035261e-07, + "loss": 0.60770595, + "num_input_tokens_seen": 124315315, + "router_z_loss_clip": 3.375, + "router_z_loss_mlp": 1.92578125, + "step": 5776, + "time_per_iteration": 3.4824419021606445 + }, + { + "auxiliary_loss_clip": 0.01492045, + "auxiliary_loss_mlp": 0.01268129, + "balance_loss_clip": 1.13754797, + "balance_loss_mlp": 1.03486025, + "epoch": 0.6946431792220285, + "flos": 18954938754720.0, + "grad_norm": 2.672474667178998, + "language_loss": 0.7695024, + "learning_rate": 9.00789215231414e-07, + "loss": 0.79710412, + "num_input_tokens_seen": 124333710, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 2.32617188, + "step": 5777, + "time_per_iteration": 2.947413921356201 + }, + { + "auxiliary_loss_clip": 0.01491364, + "auxiliary_loss_mlp": 0.0125908, + "balance_loss_clip": 1.13612258, + "balance_loss_mlp": 1.02294993, + "epoch": 0.6947634221126676, + "flos": 20340885493440.0, + "grad_norm": 2.5919042299508948, + "language_loss": 0.82127905, + "learning_rate": 9.001385267087056e-07, + "loss": 0.84878349, + "num_input_tokens_seen": 124352855, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 2.35546875, + "step": 5778, + "time_per_iteration": 3.052736282348633 + }, + { + "auxiliary_loss_clip": 0.01490561, + "auxiliary_loss_mlp": 0.01256874, + "balance_loss_clip": 1.135607, + "balance_loss_mlp": 1.02207947, + "epoch": 0.6948836650033067, + "flos": 21835800928800.0, + "grad_norm": 1.6427983783107198, + "language_loss": 0.70525229, + "learning_rate": 8.994880050341072e-07, + "loss": 0.73272663, + "num_input_tokens_seen": 124372960, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.34179688, + "step": 5779, + "time_per_iteration": 4.019910573959351 + }, + { + "auxiliary_loss_clip": 0.0149964, + "auxiliary_loss_mlp": 0.01272181, + "balance_loss_clip": 1.14547515, + "balance_loss_mlp": 1.03567004, + "epoch": 0.6950039078939457, + "flos": 23659670574720.0, + "grad_norm": 1.7337467305314813, + "language_loss": 0.77711225, + "learning_rate": 8.988376503063026e-07, + "loss": 0.80483043, + "num_input_tokens_seen": 124394220, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 2.359375, + "step": 5780, + "time_per_iteration": 3.8687124252319336 + }, + { + "auxiliary_loss_clip": 0.0149309, + "auxiliary_loss_mlp": 0.01265683, + "balance_loss_clip": 1.13750899, + "balance_loss_mlp": 1.02974355, + "epoch": 0.6951241507845849, + "flos": 21794269229280.0, + "grad_norm": 2.1276148452590977, + "language_loss": 0.82287192, + "learning_rate": 8.981874626239521e-07, + "loss": 0.85045964, + "num_input_tokens_seen": 124412795, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.35351562, + "step": 5781, + "time_per_iteration": 2.9755563735961914 + }, + { + "auxiliary_loss_clip": 0.01494936, + "auxiliary_loss_mlp": 0.01263047, + "balance_loss_clip": 1.14029145, + "balance_loss_mlp": 1.0284431, + "epoch": 0.695244393675224, + "flos": 14649328969920.0, + "grad_norm": 2.4700027110828344, + "language_loss": 0.8807891, + "learning_rate": 8.975374420856872e-07, + "loss": 0.908369, + "num_input_tokens_seen": 124429690, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.33984375, + "step": 5782, + "time_per_iteration": 3.1634552478790283 + }, + { + "auxiliary_loss_clip": 0.01492872, + "auxiliary_loss_mlp": 0.0125633, + "balance_loss_clip": 1.13917041, + "balance_loss_mlp": 1.02153552, + "epoch": 0.695364636565863, + "flos": 16875089406720.0, + "grad_norm": 2.39810355450629, + "language_loss": 0.72438985, + "learning_rate": 8.968875887901157e-07, + "loss": 0.75188196, + "num_input_tokens_seen": 124447070, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 2.34179688, + "step": 5783, + "time_per_iteration": 3.1644749641418457 + }, + { + "auxiliary_loss_clip": 0.0149454, + "auxiliary_loss_mlp": 0.01264111, + "balance_loss_clip": 1.13959694, + "balance_loss_mlp": 1.02836227, + "epoch": 0.6954848794565022, + "flos": 19356564048480.0, + "grad_norm": 2.448882909908577, + "language_loss": 0.63048553, + "learning_rate": 8.9623790283582e-07, + "loss": 0.65807211, + "num_input_tokens_seen": 124464950, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.3515625, + "step": 5784, + "time_per_iteration": 3.131988286972046 + }, + { + "auxiliary_loss_clip": 0.01495866, + "auxiliary_loss_mlp": 0.01279268, + "balance_loss_clip": 1.14093661, + "balance_loss_mlp": 1.04237533, + "epoch": 0.6956051223471412, + "flos": 18992260428480.0, + "grad_norm": 3.3411272329966017, + "language_loss": 0.77102101, + "learning_rate": 8.955883843213561e-07, + "loss": 0.79877234, + "num_input_tokens_seen": 124483965, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 2.36328125, + "step": 5785, + "time_per_iteration": 3.0850882530212402 + }, + { + "auxiliary_loss_clip": 0.01487096, + "auxiliary_loss_mlp": 0.01267054, + "balance_loss_clip": 1.1299541, + "balance_loss_mlp": 1.03073335, + "epoch": 0.6957253652377803, + "flos": 16109084636640.0, + "grad_norm": 2.5168294960257427, + "language_loss": 0.86951816, + "learning_rate": 8.949390333452569e-07, + "loss": 0.89705968, + "num_input_tokens_seen": 124501910, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 2.35742188, + "step": 5786, + "time_per_iteration": 3.0844461917877197 + }, + { + "auxiliary_loss_clip": 0.01492405, + "auxiliary_loss_mlp": 0.0126429, + "balance_loss_clip": 1.13700795, + "balance_loss_mlp": 1.03025818, + "epoch": 0.6958456081284194, + "flos": 29391279599520.0, + "grad_norm": 1.8176570910199414, + "language_loss": 0.67697626, + "learning_rate": 8.942898500060279e-07, + "loss": 0.70454323, + "num_input_tokens_seen": 124521625, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.33398438, + "step": 5787, + "time_per_iteration": 3.1479501724243164 + }, + { + "auxiliary_loss_clip": 0.01495505, + "auxiliary_loss_mlp": 0.01281049, + "balance_loss_clip": 1.14061856, + "balance_loss_mlp": 1.04358387, + "epoch": 0.6959658510190585, + "flos": 25157165124960.0, + "grad_norm": 3.427318773117331, + "language_loss": 0.71905434, + "learning_rate": 8.936408344021493e-07, + "loss": 0.74681991, + "num_input_tokens_seen": 124538540, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.36914062, + "step": 5788, + "time_per_iteration": 3.127891778945923 + }, + { + "auxiliary_loss_clip": 0.01495983, + "auxiliary_loss_mlp": 0.01279306, + "balance_loss_clip": 1.14165759, + "balance_loss_mlp": 1.04260397, + "epoch": 0.6960860939096976, + "flos": 42817716812160.0, + "grad_norm": 3.0181320362134687, + "language_loss": 0.71458161, + "learning_rate": 8.929919866320765e-07, + "loss": 0.74233449, + "num_input_tokens_seen": 124559355, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.36132812, + "step": 5789, + "time_per_iteration": 3.955609083175659 + }, + { + "auxiliary_loss_clip": 0.0149439, + "auxiliary_loss_mlp": 0.01264658, + "balance_loss_clip": 1.14056098, + "balance_loss_mlp": 1.02833796, + "epoch": 0.6962063368003367, + "flos": 17568498949920.0, + "grad_norm": 2.1225771111311786, + "language_loss": 0.81407666, + "learning_rate": 8.923433067942385e-07, + "loss": 0.84166712, + "num_input_tokens_seen": 124577920, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 2.35742188, + "step": 5790, + "time_per_iteration": 3.111487865447998 + }, + { + "auxiliary_loss_clip": 0.01501164, + "auxiliary_loss_mlp": 0.01268736, + "balance_loss_clip": 1.14677143, + "balance_loss_mlp": 1.03298807, + "epoch": 0.6963265796909758, + "flos": 21253787484480.0, + "grad_norm": 1.985410256766594, + "language_loss": 0.68900692, + "learning_rate": 8.916947949870417e-07, + "loss": 0.71670592, + "num_input_tokens_seen": 124597585, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.3515625, + "step": 5791, + "time_per_iteration": 3.0647714138031006 + }, + { + "auxiliary_loss_clip": 0.01454814, + "auxiliary_loss_mlp": 0.01194267, + "balance_loss_clip": 1.11780691, + "balance_loss_mlp": 1.00124359, + "epoch": 0.6964468225816148, + "flos": 68835100985280.0, + "grad_norm": 0.743255521806118, + "language_loss": 0.58092999, + "learning_rate": 8.910464513088615e-07, + "loss": 0.6074208, + "num_input_tokens_seen": 124661625, + "router_z_loss_clip": 3.375, + "router_z_loss_mlp": 1.9296875, + "step": 5792, + "time_per_iteration": 3.613189458847046 + }, + { + "auxiliary_loss_clip": 0.01497882, + "auxiliary_loss_mlp": 0.01258144, + "balance_loss_clip": 1.14424753, + "balance_loss_mlp": 1.02201426, + "epoch": 0.696567065472254, + "flos": 18952776849600.0, + "grad_norm": 2.2265979003839798, + "language_loss": 0.78523815, + "learning_rate": 8.903982758580542e-07, + "loss": 0.81279844, + "num_input_tokens_seen": 124680565, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 2.359375, + "step": 5793, + "time_per_iteration": 3.040863275527954 + }, + { + "auxiliary_loss_clip": 0.01500678, + "auxiliary_loss_mlp": 0.01264076, + "balance_loss_clip": 1.14677119, + "balance_loss_mlp": 1.02851868, + "epoch": 0.696687308362893, + "flos": 22858885317600.0, + "grad_norm": 2.028222509372562, + "language_loss": 0.80773199, + "learning_rate": 8.897502687329457e-07, + "loss": 0.83537954, + "num_input_tokens_seen": 124700365, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 2.34960938, + "step": 5794, + "time_per_iteration": 3.0691940784454346 + }, + { + "auxiliary_loss_clip": 0.01497362, + "auxiliary_loss_mlp": 0.01258523, + "balance_loss_clip": 1.14247131, + "balance_loss_mlp": 1.02678001, + "epoch": 0.6968075512535321, + "flos": 24975259848000.0, + "grad_norm": 3.429485740983152, + "language_loss": 0.79998493, + "learning_rate": 8.891024300318382e-07, + "loss": 0.82754374, + "num_input_tokens_seen": 124718935, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.3125, + "step": 5795, + "time_per_iteration": 2.9580817222595215 + }, + { + "auxiliary_loss_clip": 0.01493165, + "auxiliary_loss_mlp": 0.01260032, + "balance_loss_clip": 1.13936615, + "balance_loss_mlp": 1.02733541, + "epoch": 0.6969277941441713, + "flos": 21032436556800.0, + "grad_norm": 2.654931355405053, + "language_loss": 0.76105541, + "learning_rate": 8.884547598530103e-07, + "loss": 0.78858739, + "num_input_tokens_seen": 124739505, + "router_z_loss_clip": 3.54101562, + "router_z_loss_mlp": 2.32226562, + "step": 5796, + "time_per_iteration": 3.0431907176971436 + }, + { + "auxiliary_loss_clip": 0.01492454, + "auxiliary_loss_mlp": 0.01265976, + "balance_loss_clip": 1.13920271, + "balance_loss_mlp": 1.03079975, + "epoch": 0.6970480370348103, + "flos": 21581527993920.0, + "grad_norm": 1.7873663587795405, + "language_loss": 0.74848038, + "learning_rate": 8.8780725829471e-07, + "loss": 0.77606469, + "num_input_tokens_seen": 124757410, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 2.34765625, + "step": 5797, + "time_per_iteration": 3.0560898780822754 + }, + { + "auxiliary_loss_clip": 0.01492773, + "auxiliary_loss_mlp": 0.01265146, + "balance_loss_clip": 1.13876534, + "balance_loss_mlp": 1.03016067, + "epoch": 0.6971682799254494, + "flos": 22421379620160.0, + "grad_norm": 4.60403272600183, + "language_loss": 0.77927601, + "learning_rate": 8.87159925455165e-07, + "loss": 0.80685514, + "num_input_tokens_seen": 124777240, + "router_z_loss_clip": 3.54101562, + "router_z_loss_mlp": 2.34570312, + "step": 5798, + "time_per_iteration": 3.144347906112671 + }, + { + "auxiliary_loss_clip": 0.01491894, + "auxiliary_loss_mlp": 0.0126604, + "balance_loss_clip": 1.13816571, + "balance_loss_mlp": 1.0314362, + "epoch": 0.6972885228160886, + "flos": 20007607472640.0, + "grad_norm": 1.8677460421055798, + "language_loss": 0.73132348, + "learning_rate": 8.865127614325738e-07, + "loss": 0.75890279, + "num_input_tokens_seen": 124795670, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 2.34179688, + "step": 5799, + "time_per_iteration": 3.081949234008789 + }, + { + "auxiliary_loss_clip": 0.01496009, + "auxiliary_loss_mlp": 0.0126373, + "balance_loss_clip": 1.14125085, + "balance_loss_mlp": 1.02531171, + "epoch": 0.6974087657067276, + "flos": 37856550152160.0, + "grad_norm": 1.9869561187889448, + "language_loss": 0.6647985, + "learning_rate": 8.85865766325113e-07, + "loss": 0.69239587, + "num_input_tokens_seen": 124819600, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.37890625, + "step": 5800, + "time_per_iteration": 3.100449323654175 + }, + { + "auxiliary_loss_clip": 0.01498407, + "auxiliary_loss_mlp": 0.01265144, + "balance_loss_clip": 1.14446354, + "balance_loss_mlp": 1.02787018, + "epoch": 0.6975290085973667, + "flos": 29491411034880.0, + "grad_norm": 2.2702886299281078, + "language_loss": 0.72132689, + "learning_rate": 8.852189402309287e-07, + "loss": 0.7489624, + "num_input_tokens_seen": 124838785, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 2.3671875, + "step": 5801, + "time_per_iteration": 3.0326976776123047 + }, + { + "auxiliary_loss_clip": 0.01496076, + "auxiliary_loss_mlp": 0.01277389, + "balance_loss_clip": 1.14127898, + "balance_loss_mlp": 1.04183161, + "epoch": 0.6976492514880057, + "flos": 12897258059520.0, + "grad_norm": 8.240028694325416, + "language_loss": 0.74234861, + "learning_rate": 8.845722832481441e-07, + "loss": 0.77008325, + "num_input_tokens_seen": 124854215, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.3515625, + "step": 5802, + "time_per_iteration": 3.017996311187744 + }, + { + "auxiliary_loss_clip": 0.01489512, + "auxiliary_loss_mlp": 0.0127742, + "balance_loss_clip": 1.13452291, + "balance_loss_mlp": 1.04167175, + "epoch": 0.6977694943786449, + "flos": 24355242023040.0, + "grad_norm": 2.0444383865949436, + "language_loss": 0.77570283, + "learning_rate": 8.83925795474858e-07, + "loss": 0.80337214, + "num_input_tokens_seen": 124874340, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.35351562, + "step": 5803, + "time_per_iteration": 3.92741322517395 + }, + { + "auxiliary_loss_clip": 0.01493357, + "auxiliary_loss_mlp": 0.01274221, + "balance_loss_clip": 1.13745451, + "balance_loss_mlp": 1.03942609, + "epoch": 0.6978897372692839, + "flos": 29901077098560.0, + "grad_norm": 2.70766877208789, + "language_loss": 0.59711748, + "learning_rate": 8.832794770091414e-07, + "loss": 0.62479329, + "num_input_tokens_seen": 124895175, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.34765625, + "step": 5804, + "time_per_iteration": 3.160003423690796 + }, + { + "auxiliary_loss_clip": 0.01490334, + "auxiliary_loss_mlp": 0.01260817, + "balance_loss_clip": 1.1362313, + "balance_loss_mlp": 1.02602196, + "epoch": 0.698009980159923, + "flos": 21763698768000.0, + "grad_norm": 2.1919687196069995, + "language_loss": 0.82283288, + "learning_rate": 8.826333279490401e-07, + "loss": 0.85034442, + "num_input_tokens_seen": 124915810, + "router_z_loss_clip": 3.54101562, + "router_z_loss_mlp": 2.34179688, + "step": 5805, + "time_per_iteration": 3.1347856521606445 + }, + { + "auxiliary_loss_clip": 0.01493431, + "auxiliary_loss_mlp": 0.01261864, + "balance_loss_clip": 1.1401639, + "balance_loss_mlp": 1.02745104, + "epoch": 0.6981302230505622, + "flos": 19858775915520.0, + "grad_norm": 2.191654304373434, + "language_loss": 0.68476617, + "learning_rate": 8.819873483925748e-07, + "loss": 0.71231914, + "num_input_tokens_seen": 124932930, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 2.34179688, + "step": 5806, + "time_per_iteration": 3.968275785446167 + }, + { + "auxiliary_loss_clip": 0.01494133, + "auxiliary_loss_mlp": 0.01261912, + "balance_loss_clip": 1.13926411, + "balance_loss_mlp": 1.02730823, + "epoch": 0.6982504659412012, + "flos": 22200749327520.0, + "grad_norm": 2.376019847902374, + "language_loss": 0.7467649, + "learning_rate": 8.81341538437739e-07, + "loss": 0.77432531, + "num_input_tokens_seen": 124951220, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.33984375, + "step": 5807, + "time_per_iteration": 3.9923834800720215 + }, + { + "auxiliary_loss_clip": 0.01487147, + "auxiliary_loss_mlp": 0.0126451, + "balance_loss_clip": 1.13206029, + "balance_loss_mlp": 1.0281899, + "epoch": 0.6983707088318403, + "flos": 35591306136480.0, + "grad_norm": 1.8873497108753627, + "language_loss": 0.68306851, + "learning_rate": 8.80695898182503e-07, + "loss": 0.71058512, + "num_input_tokens_seen": 124972200, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.35742188, + "step": 5808, + "time_per_iteration": 3.0797533988952637 + }, + { + "auxiliary_loss_clip": 0.01447377, + "auxiliary_loss_mlp": 0.01192879, + "balance_loss_clip": 1.1093384, + "balance_loss_mlp": 1.00214386, + "epoch": 0.6984909517224794, + "flos": 65446944570720.0, + "grad_norm": 0.8355947158401845, + "language_loss": 0.65023577, + "learning_rate": 8.800504277248093e-07, + "loss": 0.67663836, + "num_input_tokens_seen": 125036950, + "router_z_loss_clip": 3.375, + "router_z_loss_mlp": 1.90625, + "step": 5809, + "time_per_iteration": 3.4740827083587646 + }, + { + "auxiliary_loss_clip": 0.01493699, + "auxiliary_loss_mlp": 0.01264194, + "balance_loss_clip": 1.13818145, + "balance_loss_mlp": 1.02882731, + "epoch": 0.6986111946131185, + "flos": 18548648297280.0, + "grad_norm": 3.0214048564585188, + "language_loss": 0.75230992, + "learning_rate": 8.794051271625753e-07, + "loss": 0.77988887, + "num_input_tokens_seen": 125054585, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.34765625, + "step": 5810, + "time_per_iteration": 3.0985219478607178 + }, + { + "auxiliary_loss_clip": 0.01485954, + "auxiliary_loss_mlp": 0.01266019, + "balance_loss_clip": 1.13078976, + "balance_loss_mlp": 1.0321784, + "epoch": 0.6987314375037575, + "flos": 23041397445120.0, + "grad_norm": 1.6008411321847873, + "language_loss": 0.8308866, + "learning_rate": 8.787599965936925e-07, + "loss": 0.85840636, + "num_input_tokens_seen": 125075515, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.33203125, + "step": 5811, + "time_per_iteration": 3.093592882156372 + }, + { + "auxiliary_loss_clip": 0.01497621, + "auxiliary_loss_mlp": 0.0127258, + "balance_loss_clip": 1.14324439, + "balance_loss_mlp": 1.0381664, + "epoch": 0.6988516803943967, + "flos": 38403517612320.0, + "grad_norm": 2.1465364966196985, + "language_loss": 0.72104317, + "learning_rate": 8.781150361160261e-07, + "loss": 0.7487452, + "num_input_tokens_seen": 125097425, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 2.33789062, + "step": 5812, + "time_per_iteration": 3.0884599685668945 + }, + { + "auxiliary_loss_clip": 0.01488315, + "auxiliary_loss_mlp": 0.01262841, + "balance_loss_clip": 1.13240182, + "balance_loss_mlp": 1.02785563, + "epoch": 0.6989719232850358, + "flos": 24099262320960.0, + "grad_norm": 1.8289519235144351, + "language_loss": 0.74054009, + "learning_rate": 8.774702458274181e-07, + "loss": 0.76805168, + "num_input_tokens_seen": 125117830, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.34375, + "step": 5813, + "time_per_iteration": 2.9997518062591553 + }, + { + "auxiliary_loss_clip": 0.01489061, + "auxiliary_loss_mlp": 0.01260858, + "balance_loss_clip": 1.13496888, + "balance_loss_mlp": 1.02339339, + "epoch": 0.6990921661756748, + "flos": 14868631776960.0, + "grad_norm": 2.721689006046676, + "language_loss": 0.70686775, + "learning_rate": 8.768256258256799e-07, + "loss": 0.73436695, + "num_input_tokens_seen": 125134455, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 2.36914062, + "step": 5814, + "time_per_iteration": 2.913804292678833 + }, + { + "auxiliary_loss_clip": 0.01494576, + "auxiliary_loss_mlp": 0.0126384, + "balance_loss_clip": 1.14022064, + "balance_loss_mlp": 1.02980876, + "epoch": 0.699212409066314, + "flos": 20195808824160.0, + "grad_norm": 2.5215626040541492, + "language_loss": 0.73911762, + "learning_rate": 8.76181176208602e-07, + "loss": 0.76670182, + "num_input_tokens_seen": 125152555, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 2.33398438, + "step": 5815, + "time_per_iteration": 3.0749764442443848 + }, + { + "auxiliary_loss_clip": 0.01489993, + "auxiliary_loss_mlp": 0.01263832, + "balance_loss_clip": 1.13415468, + "balance_loss_mlp": 1.02713013, + "epoch": 0.699332651956953, + "flos": 19429955766720.0, + "grad_norm": 1.9081772233541048, + "language_loss": 0.73451263, + "learning_rate": 8.755368970739461e-07, + "loss": 0.76205087, + "num_input_tokens_seen": 125171915, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.36132812, + "step": 5816, + "time_per_iteration": 3.853742837905884 + }, + { + "auxiliary_loss_clip": 0.01488099, + "auxiliary_loss_mlp": 0.01272599, + "balance_loss_clip": 1.13269818, + "balance_loss_mlp": 1.03418005, + "epoch": 0.6994528948475921, + "flos": 16145571890880.0, + "grad_norm": 2.589396067635058, + "language_loss": 0.61253285, + "learning_rate": 8.748927885194479e-07, + "loss": 0.64013982, + "num_input_tokens_seen": 125190220, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.37890625, + "step": 5817, + "time_per_iteration": 3.02323055267334 + }, + { + "auxiliary_loss_clip": 0.01448484, + "auxiliary_loss_mlp": 0.01196526, + "balance_loss_clip": 1.11018348, + "balance_loss_mlp": 1.00579071, + "epoch": 0.6995731377382313, + "flos": 64958538918240.0, + "grad_norm": 0.8035453125096023, + "language_loss": 0.5734576, + "learning_rate": 8.742488506428209e-07, + "loss": 0.5999077, + "num_input_tokens_seen": 125249310, + "router_z_loss_clip": 3.375, + "router_z_loss_mlp": 1.90625, + "step": 5818, + "time_per_iteration": 3.442296266555786 + }, + { + "auxiliary_loss_clip": 0.01490534, + "auxiliary_loss_mlp": 0.01267035, + "balance_loss_clip": 1.13417268, + "balance_loss_mlp": 1.03300285, + "epoch": 0.6996933806288703, + "flos": 24902474980320.0, + "grad_norm": 2.0251270851333665, + "language_loss": 0.77927744, + "learning_rate": 8.736050835417466e-07, + "loss": 0.80685318, + "num_input_tokens_seen": 125269350, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.3359375, + "step": 5819, + "time_per_iteration": 2.987043857574463 + }, + { + "auxiliary_loss_clip": 0.01491981, + "auxiliary_loss_mlp": 0.01263859, + "balance_loss_clip": 1.1357733, + "balance_loss_mlp": 1.02830124, + "epoch": 0.6998136235195094, + "flos": 20779946245440.0, + "grad_norm": 3.655959754748782, + "language_loss": 0.61746991, + "learning_rate": 8.729614873138862e-07, + "loss": 0.64502835, + "num_input_tokens_seen": 125286985, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.3515625, + "step": 5820, + "time_per_iteration": 2.9071972370147705 + }, + { + "auxiliary_loss_clip": 0.01497062, + "auxiliary_loss_mlp": 0.01278331, + "balance_loss_clip": 1.14088964, + "balance_loss_mlp": 1.04353678, + "epoch": 0.6999338664101485, + "flos": 23735793120480.0, + "grad_norm": 2.019827098125248, + "language_loss": 0.78075612, + "learning_rate": 8.723180620568716e-07, + "loss": 0.80851001, + "num_input_tokens_seen": 125306240, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.34375, + "step": 5821, + "time_per_iteration": 3.042598247528076 + }, + { + "auxiliary_loss_clip": 0.01491638, + "auxiliary_loss_mlp": 0.01260448, + "balance_loss_clip": 1.13668382, + "balance_loss_mlp": 1.02736986, + "epoch": 0.7000541093007876, + "flos": 19866627044640.0, + "grad_norm": 1.9983010463509057, + "language_loss": 0.85301864, + "learning_rate": 8.716748078683116e-07, + "loss": 0.88053954, + "num_input_tokens_seen": 125323015, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.328125, + "step": 5822, + "time_per_iteration": 2.989520788192749 + }, + { + "auxiliary_loss_clip": 0.01490183, + "auxiliary_loss_mlp": 0.01262786, + "balance_loss_clip": 1.13456893, + "balance_loss_mlp": 1.02722907, + "epoch": 0.7001743521914267, + "flos": 29681622578880.0, + "grad_norm": 2.4795299542737013, + "language_loss": 0.68929017, + "learning_rate": 8.710317248457855e-07, + "loss": 0.71681988, + "num_input_tokens_seen": 125342630, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.34960938, + "step": 5823, + "time_per_iteration": 2.9872679710388184 + }, + { + "auxiliary_loss_clip": 0.01496644, + "auxiliary_loss_mlp": 0.01263125, + "balance_loss_clip": 1.14081204, + "balance_loss_mlp": 1.02756763, + "epoch": 0.7002945950820658, + "flos": 27492197683680.0, + "grad_norm": 1.9140220125931937, + "language_loss": 0.72089976, + "learning_rate": 8.703888130868482e-07, + "loss": 0.74849749, + "num_input_tokens_seen": 125364480, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.34960938, + "step": 5824, + "time_per_iteration": 3.031712293624878 + }, + { + "auxiliary_loss_clip": 0.01494998, + "auxiliary_loss_mlp": 0.012662, + "balance_loss_clip": 1.14028955, + "balance_loss_mlp": 1.03235936, + "epoch": 0.7004148379727049, + "flos": 22160355472800.0, + "grad_norm": 2.2154471256440305, + "language_loss": 0.82030219, + "learning_rate": 8.697460726890307e-07, + "loss": 0.84791422, + "num_input_tokens_seen": 125381625, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 2.33398438, + "step": 5825, + "time_per_iteration": 2.973734140396118 + }, + { + "auxiliary_loss_clip": 0.01494067, + "auxiliary_loss_mlp": 0.01267854, + "balance_loss_clip": 1.13839054, + "balance_loss_mlp": 1.03172421, + "epoch": 0.7005350808633439, + "flos": 19425594028320.0, + "grad_norm": 2.1415605594952187, + "language_loss": 0.90636873, + "learning_rate": 8.691035037498354e-07, + "loss": 0.93398792, + "num_input_tokens_seen": 125397615, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.35546875, + "step": 5826, + "time_per_iteration": 2.9907593727111816 + }, + { + "auxiliary_loss_clip": 0.01490304, + "auxiliary_loss_mlp": 0.01262339, + "balance_loss_clip": 1.13429058, + "balance_loss_mlp": 1.02887917, + "epoch": 0.7006553237539831, + "flos": 23478941070720.0, + "grad_norm": 2.899292177310142, + "language_loss": 0.72402382, + "learning_rate": 8.684611063667391e-07, + "loss": 0.75155026, + "num_input_tokens_seen": 125418080, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.33007812, + "step": 5827, + "time_per_iteration": 3.0927228927612305 + }, + { + "auxiliary_loss_clip": 0.01496879, + "auxiliary_loss_mlp": 0.01256856, + "balance_loss_clip": 1.14231563, + "balance_loss_mlp": 1.02072644, + "epoch": 0.7007755666446221, + "flos": 31215604383360.0, + "grad_norm": 1.8944770407854201, + "language_loss": 0.77142, + "learning_rate": 8.678188806371935e-07, + "loss": 0.79895735, + "num_input_tokens_seen": 125440115, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 2.35546875, + "step": 5828, + "time_per_iteration": 3.0965540409088135 + }, + { + "auxiliary_loss_clip": 0.01492146, + "auxiliary_loss_mlp": 0.01253867, + "balance_loss_clip": 1.13761628, + "balance_loss_mlp": 1.02078938, + "epoch": 0.7008958095352612, + "flos": 18151839879840.0, + "grad_norm": 1.8424506306257982, + "language_loss": 0.85550749, + "learning_rate": 8.671768266586228e-07, + "loss": 0.88296765, + "num_input_tokens_seen": 125458240, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 2.32617188, + "step": 5829, + "time_per_iteration": 2.976120948791504 + }, + { + "auxiliary_loss_clip": 0.01491698, + "auxiliary_loss_mlp": 0.01264548, + "balance_loss_clip": 1.13496757, + "balance_loss_mlp": 1.02937198, + "epoch": 0.7010160524259004, + "flos": 27454686369120.0, + "grad_norm": 1.7626409242629273, + "language_loss": 0.78352308, + "learning_rate": 8.665349445284275e-07, + "loss": 0.81108552, + "num_input_tokens_seen": 125477980, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.34570312, + "step": 5830, + "time_per_iteration": 2.961733102798462 + }, + { + "auxiliary_loss_clip": 0.01486661, + "auxiliary_loss_mlp": 0.01269943, + "balance_loss_clip": 1.13234639, + "balance_loss_mlp": 1.03514814, + "epoch": 0.7011362953165394, + "flos": 23844344607360.0, + "grad_norm": 1.5283623044773642, + "language_loss": 0.8094641, + "learning_rate": 8.658932343439799e-07, + "loss": 0.83703011, + "num_input_tokens_seen": 125497765, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.34179688, + "step": 5831, + "time_per_iteration": 3.758798122406006 + }, + { + "auxiliary_loss_clip": 0.01492329, + "auxiliary_loss_mlp": 0.01279614, + "balance_loss_clip": 1.13693357, + "balance_loss_mlp": 1.04424691, + "epoch": 0.7012565382071785, + "flos": 24825555943200.0, + "grad_norm": 14.146358887988024, + "language_loss": 0.77700472, + "learning_rate": 8.65251696202627e-07, + "loss": 0.80472416, + "num_input_tokens_seen": 125514145, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.34960938, + "step": 5832, + "time_per_iteration": 2.970341205596924 + }, + { + "auxiliary_loss_clip": 0.0148419, + "auxiliary_loss_mlp": 0.01267057, + "balance_loss_clip": 1.12730229, + "balance_loss_mlp": 1.03035545, + "epoch": 0.7013767810978175, + "flos": 21399584788800.0, + "grad_norm": 3.447412619204077, + "language_loss": 0.87973005, + "learning_rate": 8.646103302016896e-07, + "loss": 0.90724254, + "num_input_tokens_seen": 125533115, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.36132812, + "step": 5833, + "time_per_iteration": 2.9998271465301514 + }, + { + "auxiliary_loss_clip": 0.01482929, + "auxiliary_loss_mlp": 0.01271344, + "balance_loss_clip": 1.1258136, + "balance_loss_mlp": 1.03521466, + "epoch": 0.7014970239884567, + "flos": 16688822391360.0, + "grad_norm": 1.790489924365349, + "language_loss": 0.8888067, + "learning_rate": 8.639691364384614e-07, + "loss": 0.91634947, + "num_input_tokens_seen": 125550740, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.35742188, + "step": 5834, + "time_per_iteration": 3.819125175476074 + }, + { + "auxiliary_loss_clip": 0.01484327, + "auxiliary_loss_mlp": 0.01269707, + "balance_loss_clip": 1.12857699, + "balance_loss_mlp": 1.03357744, + "epoch": 0.7016172668790958, + "flos": 12569934759840.0, + "grad_norm": 2.4985169273306016, + "language_loss": 0.7241044, + "learning_rate": 8.633281150102136e-07, + "loss": 0.75164473, + "num_input_tokens_seen": 125567590, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.35742188, + "step": 5835, + "time_per_iteration": 3.8399100303649902 + }, + { + "auxiliary_loss_clip": 0.0148352, + "auxiliary_loss_mlp": 0.01254176, + "balance_loss_clip": 1.12798822, + "balance_loss_mlp": 1.01976252, + "epoch": 0.7017375097697348, + "flos": 17454409951680.0, + "grad_norm": 2.96373033652657, + "language_loss": 0.68399489, + "learning_rate": 8.626872660141855e-07, + "loss": 0.71137184, + "num_input_tokens_seen": 125585500, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.33789062, + "step": 5836, + "time_per_iteration": 3.0250868797302246 + }, + { + "auxiliary_loss_clip": 0.01487181, + "auxiliary_loss_mlp": 0.01255907, + "balance_loss_clip": 1.13136864, + "balance_loss_mlp": 1.01996839, + "epoch": 0.701857752660374, + "flos": 18514891870560.0, + "grad_norm": 1.733849941793182, + "language_loss": 0.74823248, + "learning_rate": 8.620465895475957e-07, + "loss": 0.77566338, + "num_input_tokens_seen": 125603720, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.35351562, + "step": 5837, + "time_per_iteration": 2.989727258682251 + }, + { + "auxiliary_loss_clip": 0.01492656, + "auxiliary_loss_mlp": 0.0126115, + "balance_loss_clip": 1.13810802, + "balance_loss_mlp": 1.02711797, + "epoch": 0.701977995551013, + "flos": 24428292387840.0, + "grad_norm": 1.7656629900789118, + "language_loss": 0.75362968, + "learning_rate": 8.614060857076333e-07, + "loss": 0.78116781, + "num_input_tokens_seen": 125624390, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.33398438, + "step": 5838, + "time_per_iteration": 3.031402349472046 + }, + { + "auxiliary_loss_clip": 0.01490911, + "auxiliary_loss_mlp": 0.01260513, + "balance_loss_clip": 1.13613057, + "balance_loss_mlp": 1.02476501, + "epoch": 0.7020982384416521, + "flos": 23004986047200.0, + "grad_norm": 2.0161754166344985, + "language_loss": 0.74754608, + "learning_rate": 8.60765754591462e-07, + "loss": 0.7750603, + "num_input_tokens_seen": 125644085, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 2.3515625, + "step": 5839, + "time_per_iteration": 3.16679310798645 + }, + { + "auxiliary_loss_clip": 0.01488713, + "auxiliary_loss_mlp": 0.01269419, + "balance_loss_clip": 1.1328938, + "balance_loss_mlp": 1.03481567, + "epoch": 0.7022184813322913, + "flos": 20451181675680.0, + "grad_norm": 1.9758260393280256, + "language_loss": 0.73258829, + "learning_rate": 8.601255962962211e-07, + "loss": 0.76016963, + "num_input_tokens_seen": 125663095, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.34179688, + "step": 5840, + "time_per_iteration": 3.1533803939819336 + }, + { + "auxiliary_loss_clip": 0.01500168, + "auxiliary_loss_mlp": 0.0127167, + "balance_loss_clip": 1.14526033, + "balance_loss_mlp": 1.03439569, + "epoch": 0.7023387242229303, + "flos": 19792476763200.0, + "grad_norm": 2.5903751696140462, + "language_loss": 0.72246444, + "learning_rate": 8.594856109190194e-07, + "loss": 0.75018275, + "num_input_tokens_seen": 125680125, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.3671875, + "step": 5841, + "time_per_iteration": 3.180879831314087 + }, + { + "auxiliary_loss_clip": 0.01489331, + "auxiliary_loss_mlp": 0.01260402, + "balance_loss_clip": 1.13312078, + "balance_loss_mlp": 1.02484477, + "epoch": 0.7024589671135694, + "flos": 33262304155200.0, + "grad_norm": 1.664544940634266, + "language_loss": 0.69350511, + "learning_rate": 8.588457985569446e-07, + "loss": 0.7210024, + "num_input_tokens_seen": 125703035, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.34960938, + "step": 5842, + "time_per_iteration": 3.2585251331329346 + }, + { + "auxiliary_loss_clip": 0.01493469, + "auxiliary_loss_mlp": 0.01271968, + "balance_loss_clip": 1.13863516, + "balance_loss_mlp": 1.03621984, + "epoch": 0.7025792100042085, + "flos": 19101001556160.0, + "grad_norm": 2.172837080709976, + "language_loss": 0.71408099, + "learning_rate": 8.582061593070542e-07, + "loss": 0.74173534, + "num_input_tokens_seen": 125723765, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 2.35351562, + "step": 5843, + "time_per_iteration": 3.187650203704834 + }, + { + "auxiliary_loss_clip": 0.01494034, + "auxiliary_loss_mlp": 0.01261799, + "balance_loss_clip": 1.1390568, + "balance_loss_mlp": 1.027004, + "epoch": 0.7026994528948476, + "flos": 18954597401280.0, + "grad_norm": 7.219045322402389, + "language_loss": 0.76867545, + "learning_rate": 8.57566693266383e-07, + "loss": 0.79623377, + "num_input_tokens_seen": 125741455, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.34179688, + "step": 5844, + "time_per_iteration": 3.8587646484375 + }, + { + "auxiliary_loss_clip": 0.01483129, + "auxiliary_loss_mlp": 0.01269032, + "balance_loss_clip": 1.12763667, + "balance_loss_mlp": 1.03690791, + "epoch": 0.7028196957854866, + "flos": 19538696894400.0, + "grad_norm": 2.7251128538986094, + "language_loss": 0.69266868, + "learning_rate": 8.569274005319354e-07, + "loss": 0.72019029, + "num_input_tokens_seen": 125759855, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.31640625, + "step": 5845, + "time_per_iteration": 3.110292673110962 + }, + { + "auxiliary_loss_clip": 0.01495224, + "auxiliary_loss_mlp": 0.01267959, + "balance_loss_clip": 1.14128542, + "balance_loss_mlp": 1.03221059, + "epoch": 0.7029399386761258, + "flos": 20847155673600.0, + "grad_norm": 2.606639667035364, + "language_loss": 0.79527843, + "learning_rate": 8.562882812006913e-07, + "loss": 0.82291031, + "num_input_tokens_seen": 125777345, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 2.3515625, + "step": 5846, + "time_per_iteration": 3.1419687271118164 + }, + { + "auxiliary_loss_clip": 0.01487318, + "auxiliary_loss_mlp": 0.01260096, + "balance_loss_clip": 1.13281798, + "balance_loss_mlp": 1.0264461, + "epoch": 0.7030601815667649, + "flos": 22057227712800.0, + "grad_norm": 1.9564943438764117, + "language_loss": 0.7795974, + "learning_rate": 8.556493353696066e-07, + "loss": 0.80707151, + "num_input_tokens_seen": 125796345, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.33203125, + "step": 5847, + "time_per_iteration": 2.9899895191192627 + }, + { + "auxiliary_loss_clip": 0.01490171, + "auxiliary_loss_mlp": 0.01264143, + "balance_loss_clip": 1.13397324, + "balance_loss_mlp": 1.02725029, + "epoch": 0.7031804244574039, + "flos": 27201209925600.0, + "grad_norm": 2.326608135711757, + "language_loss": 0.68550503, + "learning_rate": 8.550105631356077e-07, + "loss": 0.7130481, + "num_input_tokens_seen": 125816070, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.36328125, + "step": 5848, + "time_per_iteration": 3.054245948791504 + }, + { + "auxiliary_loss_clip": 0.01492295, + "auxiliary_loss_mlp": 0.01272022, + "balance_loss_clip": 1.13640642, + "balance_loss_mlp": 1.03760934, + "epoch": 0.7033006673480431, + "flos": 22381896041280.0, + "grad_norm": 2.023037898990807, + "language_loss": 0.77246141, + "learning_rate": 8.543719645955961e-07, + "loss": 0.80010462, + "num_input_tokens_seen": 125834400, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.33789062, + "step": 5849, + "time_per_iteration": 3.058361530303955 + }, + { + "auxiliary_loss_clip": 0.01489343, + "auxiliary_loss_mlp": 0.01260611, + "balance_loss_clip": 1.13494587, + "balance_loss_mlp": 1.02524447, + "epoch": 0.7034209102386821, + "flos": 24718218157440.0, + "grad_norm": 1.8155863038689846, + "language_loss": 0.74468946, + "learning_rate": 8.537335398464467e-07, + "loss": 0.77218902, + "num_input_tokens_seen": 125854720, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.34765625, + "step": 5850, + "time_per_iteration": 3.067060947418213 + }, + { + "auxiliary_loss_clip": 0.01491549, + "auxiliary_loss_mlp": 0.01277718, + "balance_loss_clip": 1.13684249, + "balance_loss_mlp": 1.04120636, + "epoch": 0.7035411531293212, + "flos": 22557884525280.0, + "grad_norm": 3.1480911942438645, + "language_loss": 0.85609877, + "learning_rate": 8.53095288985007e-07, + "loss": 0.88379145, + "num_input_tokens_seen": 125868455, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 2.359375, + "step": 5851, + "time_per_iteration": 3.258253335952759 + }, + { + "auxiliary_loss_clip": 0.01487434, + "auxiliary_loss_mlp": 0.01259658, + "balance_loss_clip": 1.13260508, + "balance_loss_mlp": 1.02657962, + "epoch": 0.7036613960199604, + "flos": 22677207609600.0, + "grad_norm": 1.6873236311465039, + "language_loss": 0.82126784, + "learning_rate": 8.524572121081009e-07, + "loss": 0.84873879, + "num_input_tokens_seen": 125888555, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 2.32617188, + "step": 5852, + "time_per_iteration": 3.1545968055725098 + }, + { + "auxiliary_loss_clip": 0.01492122, + "auxiliary_loss_mlp": 0.01280104, + "balance_loss_clip": 1.13606596, + "balance_loss_mlp": 1.0437839, + "epoch": 0.7037816389105994, + "flos": 22494354128640.0, + "grad_norm": 2.0254571032134745, + "language_loss": 0.63079357, + "learning_rate": 8.518193093125232e-07, + "loss": 0.65851593, + "num_input_tokens_seen": 125907610, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.35742188, + "step": 5853, + "time_per_iteration": 2.945275068283081 + }, + { + "auxiliary_loss_clip": 0.01495665, + "auxiliary_loss_mlp": 0.01274289, + "balance_loss_clip": 1.14216268, + "balance_loss_mlp": 1.03834963, + "epoch": 0.7039018818012385, + "flos": 27089851754880.0, + "grad_norm": 1.6342989821547529, + "language_loss": 0.80957234, + "learning_rate": 8.511815806950436e-07, + "loss": 0.83727187, + "num_input_tokens_seen": 125928640, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 2.35351562, + "step": 5854, + "time_per_iteration": 3.0366852283477783 + }, + { + "auxiliary_loss_clip": 0.01481922, + "auxiliary_loss_mlp": 0.01265177, + "balance_loss_clip": 1.12622094, + "balance_loss_mlp": 1.03362513, + "epoch": 0.7040221246918776, + "flos": 17751807568800.0, + "grad_norm": 1.8067774809519437, + "language_loss": 0.7796253, + "learning_rate": 8.505440263524044e-07, + "loss": 0.8070963, + "num_input_tokens_seen": 125947485, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.3125, + "step": 5855, + "time_per_iteration": 2.968451738357544 + }, + { + "auxiliary_loss_clip": 0.01488118, + "auxiliary_loss_mlp": 0.01270421, + "balance_loss_clip": 1.13306546, + "balance_loss_mlp": 1.03505397, + "epoch": 0.7041423675825167, + "flos": 16281166520160.0, + "grad_norm": 2.671746125875852, + "language_loss": 0.88330328, + "learning_rate": 8.49906646381322e-07, + "loss": 0.91088873, + "num_input_tokens_seen": 125960320, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.34765625, + "step": 5856, + "time_per_iteration": 3.1167421340942383 + }, + { + "auxiliary_loss_clip": 0.01487765, + "auxiliary_loss_mlp": 0.01265437, + "balance_loss_clip": 1.13274693, + "balance_loss_mlp": 1.03064203, + "epoch": 0.7042626104731557, + "flos": 25485512484960.0, + "grad_norm": 1.8782128382212993, + "language_loss": 0.72324955, + "learning_rate": 8.492694408784884e-07, + "loss": 0.75078154, + "num_input_tokens_seen": 125980575, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.34179688, + "step": 5857, + "time_per_iteration": 3.103498697280884 + }, + { + "auxiliary_loss_clip": 0.01492195, + "auxiliary_loss_mlp": 0.01256109, + "balance_loss_clip": 1.13873172, + "balance_loss_mlp": 1.02303123, + "epoch": 0.7043828533637949, + "flos": 17859524636160.0, + "grad_norm": 2.578838050071845, + "language_loss": 0.62815058, + "learning_rate": 8.486324099405642e-07, + "loss": 0.65563369, + "num_input_tokens_seen": 125997420, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 2.32421875, + "step": 5858, + "time_per_iteration": 2.9812302589416504 + }, + { + "auxiliary_loss_clip": 0.01487613, + "auxiliary_loss_mlp": 0.01265356, + "balance_loss_clip": 1.13172114, + "balance_loss_mlp": 1.03265917, + "epoch": 0.704503096254434, + "flos": 29496683049120.0, + "grad_norm": 1.9874113081186102, + "language_loss": 0.74730593, + "learning_rate": 8.479955536641887e-07, + "loss": 0.77483565, + "num_input_tokens_seen": 126018915, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.32226562, + "step": 5859, + "time_per_iteration": 3.975736618041992 + }, + { + "auxiliary_loss_clip": 0.01483839, + "auxiliary_loss_mlp": 0.01273167, + "balance_loss_clip": 1.12881613, + "balance_loss_mlp": 1.04085159, + "epoch": 0.704623339145073, + "flos": 30923440852320.0, + "grad_norm": 2.825808482034551, + "language_loss": 0.6658994, + "learning_rate": 8.473588721459716e-07, + "loss": 0.69346946, + "num_input_tokens_seen": 126038825, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 2.3203125, + "step": 5860, + "time_per_iteration": 3.0868911743164062 + }, + { + "auxiliary_loss_clip": 0.01495158, + "auxiliary_loss_mlp": 0.01277932, + "balance_loss_clip": 1.13979614, + "balance_loss_mlp": 1.04046679, + "epoch": 0.7047435820357122, + "flos": 23917015690560.0, + "grad_norm": 2.1616374581831983, + "language_loss": 0.70654029, + "learning_rate": 8.467223654824967e-07, + "loss": 0.73427117, + "num_input_tokens_seen": 126058280, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.36914062, + "step": 5861, + "time_per_iteration": 3.0175435543060303 + }, + { + "auxiliary_loss_clip": 0.01493684, + "auxiliary_loss_mlp": 0.01258311, + "balance_loss_clip": 1.13933611, + "balance_loss_mlp": 1.02389765, + "epoch": 0.7048638249263512, + "flos": 46497240266400.0, + "grad_norm": 2.227486526146896, + "language_loss": 0.6281867, + "learning_rate": 8.460860337703233e-07, + "loss": 0.65570664, + "num_input_tokens_seen": 126078885, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.33789062, + "step": 5862, + "time_per_iteration": 4.199786424636841 + }, + { + "auxiliary_loss_clip": 0.01492919, + "auxiliary_loss_mlp": 0.01261416, + "balance_loss_clip": 1.13855636, + "balance_loss_mlp": 1.01861084, + "epoch": 0.7049840678169903, + "flos": 21691369038240.0, + "grad_norm": 2.1163424262478867, + "language_loss": 0.70799392, + "learning_rate": 8.454498771059797e-07, + "loss": 0.73553729, + "num_input_tokens_seen": 126098260, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 2.42382812, + "step": 5863, + "time_per_iteration": 3.9866929054260254 + }, + { + "auxiliary_loss_clip": 0.01485213, + "auxiliary_loss_mlp": 0.01270105, + "balance_loss_clip": 1.12952745, + "balance_loss_mlp": 1.03397489, + "epoch": 0.7051043107076294, + "flos": 18407023090560.0, + "grad_norm": 2.8402956286469756, + "language_loss": 0.83748651, + "learning_rate": 8.448138955859725e-07, + "loss": 0.86503971, + "num_input_tokens_seen": 126114845, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.35546875, + "step": 5864, + "time_per_iteration": 3.039716958999634 + }, + { + "auxiliary_loss_clip": 0.01489164, + "auxiliary_loss_mlp": 0.01276436, + "balance_loss_clip": 1.13440394, + "balance_loss_mlp": 1.0376358, + "epoch": 0.7052245535982685, + "flos": 19321138782720.0, + "grad_norm": 2.18006989304351, + "language_loss": 0.89848197, + "learning_rate": 8.44178089306778e-07, + "loss": 0.92613798, + "num_input_tokens_seen": 126132780, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.3828125, + "step": 5865, + "time_per_iteration": 3.1574037075042725 + }, + { + "auxiliary_loss_clip": 0.01485902, + "auxiliary_loss_mlp": 0.01265569, + "balance_loss_clip": 1.12976313, + "balance_loss_mlp": 1.033445, + "epoch": 0.7053447964889076, + "flos": 19064135020320.0, + "grad_norm": 2.492902156542603, + "language_loss": 0.77295256, + "learning_rate": 8.4354245836485e-07, + "loss": 0.80046731, + "num_input_tokens_seen": 126151225, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.31640625, + "step": 5866, + "time_per_iteration": 3.056173801422119 + }, + { + "auxiliary_loss_clip": 0.01485702, + "auxiliary_loss_mlp": 0.01267551, + "balance_loss_clip": 1.12914646, + "balance_loss_mlp": 1.03084874, + "epoch": 0.7054650393795466, + "flos": 27381598076160.0, + "grad_norm": 3.4295195982852977, + "language_loss": 0.73082101, + "learning_rate": 8.429070028566108e-07, + "loss": 0.75835359, + "num_input_tokens_seen": 126172535, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.36328125, + "step": 5867, + "time_per_iteration": 3.093883752822876 + }, + { + "auxiliary_loss_clip": 0.01488212, + "auxiliary_loss_mlp": 0.01275424, + "balance_loss_clip": 1.13396478, + "balance_loss_mlp": 1.04120135, + "epoch": 0.7055852822701858, + "flos": 16103774694240.0, + "grad_norm": 2.1222362967268205, + "language_loss": 0.75209469, + "learning_rate": 8.422717228784586e-07, + "loss": 0.77973104, + "num_input_tokens_seen": 126189410, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.3359375, + "step": 5868, + "time_per_iteration": 3.088334798812866 + }, + { + "auxiliary_loss_clip": 0.01493133, + "auxiliary_loss_mlp": 0.01256602, + "balance_loss_clip": 1.13818383, + "balance_loss_mlp": 1.02295148, + "epoch": 0.7057055251608249, + "flos": 11693899304640.0, + "grad_norm": 1.9559533487966827, + "language_loss": 0.69582629, + "learning_rate": 8.416366185267663e-07, + "loss": 0.72332358, + "num_input_tokens_seen": 126206910, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.33007812, + "step": 5869, + "time_per_iteration": 3.04941725730896 + }, + { + "auxiliary_loss_clip": 0.01483942, + "auxiliary_loss_mlp": 0.01264736, + "balance_loss_clip": 1.12797225, + "balance_loss_mlp": 1.03184855, + "epoch": 0.7058257680514639, + "flos": 22713808648320.0, + "grad_norm": 1.8229011155224109, + "language_loss": 0.77744073, + "learning_rate": 8.410016898978778e-07, + "loss": 0.80492753, + "num_input_tokens_seen": 126224385, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.32617188, + "step": 5870, + "time_per_iteration": 2.9581923484802246 + }, + { + "auxiliary_loss_clip": 0.01481695, + "auxiliary_loss_mlp": 0.01257445, + "balance_loss_clip": 1.12530339, + "balance_loss_mlp": 1.0262742, + "epoch": 0.7059460109421031, + "flos": 17531442773280.0, + "grad_norm": 1.9107115571133666, + "language_loss": 0.7913233, + "learning_rate": 8.403669370881115e-07, + "loss": 0.81871474, + "num_input_tokens_seen": 126243120, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.30664062, + "step": 5871, + "time_per_iteration": 2.9482665061950684 + }, + { + "auxiliary_loss_clip": 0.01488534, + "auxiliary_loss_mlp": 0.01256723, + "balance_loss_clip": 1.13342106, + "balance_loss_mlp": 1.02326322, + "epoch": 0.7060662538327421, + "flos": 23546643564960.0, + "grad_norm": 2.2746688075647508, + "language_loss": 0.78625727, + "learning_rate": 8.397323601937587e-07, + "loss": 0.81370986, + "num_input_tokens_seen": 126263020, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 2.328125, + "step": 5872, + "time_per_iteration": 3.7530338764190674 + }, + { + "auxiliary_loss_clip": 0.01488273, + "auxiliary_loss_mlp": 0.01267471, + "balance_loss_clip": 1.13253641, + "balance_loss_mlp": 1.03038704, + "epoch": 0.7061864967233812, + "flos": 30263218813440.0, + "grad_norm": 1.8281441748496827, + "language_loss": 0.76983774, + "learning_rate": 8.390979593110838e-07, + "loss": 0.79739517, + "num_input_tokens_seen": 126285150, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.36523438, + "step": 5873, + "time_per_iteration": 3.0183749198913574 + }, + { + "auxiliary_loss_clip": 0.01488402, + "auxiliary_loss_mlp": 0.01266778, + "balance_loss_clip": 1.13203907, + "balance_loss_mlp": 1.02912259, + "epoch": 0.7063067396140204, + "flos": 20703406489920.0, + "grad_norm": 1.6621880824582937, + "language_loss": 0.81607628, + "learning_rate": 8.384637345363262e-07, + "loss": 0.84362805, + "num_input_tokens_seen": 126304340, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.37109375, + "step": 5874, + "time_per_iteration": 3.03305721282959 + }, + { + "auxiliary_loss_clip": 0.01484572, + "auxiliary_loss_mlp": 0.01266359, + "balance_loss_clip": 1.12842178, + "balance_loss_mlp": 1.031564, + "epoch": 0.7064269825046594, + "flos": 32268690311040.0, + "grad_norm": 1.734008272626192, + "language_loss": 0.76871586, + "learning_rate": 8.378296859656964e-07, + "loss": 0.79622519, + "num_input_tokens_seen": 126325495, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.34179688, + "step": 5875, + "time_per_iteration": 3.049776077270508 + }, + { + "auxiliary_loss_clip": 0.01486122, + "auxiliary_loss_mlp": 0.01268059, + "balance_loss_clip": 1.13006067, + "balance_loss_mlp": 1.03421807, + "epoch": 0.7065472253952985, + "flos": 30229765812000.0, + "grad_norm": 2.5700254995751552, + "language_loss": 0.68735623, + "learning_rate": 8.371958136953792e-07, + "loss": 0.71489805, + "num_input_tokens_seen": 126345525, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.33203125, + "step": 5876, + "time_per_iteration": 3.00907826423645 + }, + { + "auxiliary_loss_clip": 0.01491586, + "auxiliary_loss_mlp": 0.01267631, + "balance_loss_clip": 1.13649368, + "balance_loss_mlp": 1.02978444, + "epoch": 0.7066674682859376, + "flos": 16218584327520.0, + "grad_norm": 2.5299064587965594, + "language_loss": 0.66494435, + "learning_rate": 8.365621178215326e-07, + "loss": 0.69253647, + "num_input_tokens_seen": 126361995, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.37304688, + "step": 5877, + "time_per_iteration": 2.940509080886841 + }, + { + "auxiliary_loss_clip": 0.01488392, + "auxiliary_loss_mlp": 0.01260955, + "balance_loss_clip": 1.13340449, + "balance_loss_mlp": 1.02749562, + "epoch": 0.7067877111765767, + "flos": 14832258307200.0, + "grad_norm": 1.9183184712629653, + "language_loss": 0.752994, + "learning_rate": 8.359285984402871e-07, + "loss": 0.78048742, + "num_input_tokens_seen": 126379260, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 2.33007812, + "step": 5878, + "time_per_iteration": 2.9165632724761963 + }, + { + "auxiliary_loss_clip": 0.01479081, + "auxiliary_loss_mlp": 0.0124744, + "balance_loss_clip": 1.12434793, + "balance_loss_mlp": 1.01455307, + "epoch": 0.7069079540672157, + "flos": 25442160233760.0, + "grad_norm": 2.8281895329780413, + "language_loss": 0.74133217, + "learning_rate": 8.352952556477489e-07, + "loss": 0.76859736, + "num_input_tokens_seen": 126397170, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 2.32617188, + "step": 5879, + "time_per_iteration": 2.9874937534332275 + }, + { + "auxiliary_loss_clip": 0.01485482, + "auxiliary_loss_mlp": 0.01259061, + "balance_loss_clip": 1.13037086, + "balance_loss_mlp": 1.02407551, + "epoch": 0.7070281969578549, + "flos": 24610045952160.0, + "grad_norm": 1.905272102590053, + "language_loss": 0.76639092, + "learning_rate": 8.34662089539993e-07, + "loss": 0.79383636, + "num_input_tokens_seen": 126416680, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.34375, + "step": 5880, + "time_per_iteration": 2.9643259048461914 + }, + { + "auxiliary_loss_clip": 0.0148704, + "auxiliary_loss_mlp": 0.01260596, + "balance_loss_clip": 1.13222194, + "balance_loss_mlp": 1.02866256, + "epoch": 0.707148439848494, + "flos": 26726913548640.0, + "grad_norm": 2.2992095377879904, + "language_loss": 0.793805, + "learning_rate": 8.340291002130722e-07, + "loss": 0.82128131, + "num_input_tokens_seen": 126435870, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.31445312, + "step": 5881, + "time_per_iteration": 3.0014450550079346 + }, + { + "auxiliary_loss_clip": 0.0148717, + "auxiliary_loss_mlp": 0.01273964, + "balance_loss_clip": 1.1312995, + "balance_loss_mlp": 1.04012334, + "epoch": 0.707268682739133, + "flos": 15087555302400.0, + "grad_norm": 2.5762900759612526, + "language_loss": 0.79680097, + "learning_rate": 8.3339628776301e-07, + "loss": 0.82441223, + "num_input_tokens_seen": 126454010, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.3359375, + "step": 5882, + "time_per_iteration": 2.9697964191436768 + }, + { + "auxiliary_loss_clip": 0.01479163, + "auxiliary_loss_mlp": 0.01263329, + "balance_loss_clip": 1.12220132, + "balance_loss_mlp": 1.03120422, + "epoch": 0.7073889256297722, + "flos": 34315996933440.0, + "grad_norm": 2.1295058930911863, + "language_loss": 0.56966281, + "learning_rate": 8.327636522858033e-07, + "loss": 0.59708768, + "num_input_tokens_seen": 126473615, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.31640625, + "step": 5883, + "time_per_iteration": 3.023063898086548 + }, + { + "auxiliary_loss_clip": 0.01484695, + "auxiliary_loss_mlp": 0.01266328, + "balance_loss_clip": 1.12920988, + "balance_loss_mlp": 1.03191531, + "epoch": 0.7075091685204112, + "flos": 20086005708000.0, + "grad_norm": 2.865790662503942, + "language_loss": 0.77421254, + "learning_rate": 8.321311938774225e-07, + "loss": 0.80172276, + "num_input_tokens_seen": 126492705, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.34179688, + "step": 5884, + "time_per_iteration": 3.0156333446502686 + }, + { + "auxiliary_loss_clip": 0.01481482, + "auxiliary_loss_mlp": 0.01255138, + "balance_loss_clip": 1.12517691, + "balance_loss_mlp": 1.0218693, + "epoch": 0.7076294114110503, + "flos": 20779225610400.0, + "grad_norm": 2.6766524643310428, + "language_loss": 0.79541469, + "learning_rate": 8.314989126338104e-07, + "loss": 0.82278085, + "num_input_tokens_seen": 126512715, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.32617188, + "step": 5885, + "time_per_iteration": 3.0613150596618652 + }, + { + "auxiliary_loss_clip": 0.01486134, + "auxiliary_loss_mlp": 0.01261652, + "balance_loss_clip": 1.1302352, + "balance_loss_mlp": 1.02704859, + "epoch": 0.7077496543016895, + "flos": 17969631177600.0, + "grad_norm": 1.8296244741050174, + "language_loss": 0.84486043, + "learning_rate": 8.308668086508847e-07, + "loss": 0.87233829, + "num_input_tokens_seen": 126530795, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.34179688, + "step": 5886, + "time_per_iteration": 3.817336082458496 + }, + { + "auxiliary_loss_clip": 0.0148614, + "auxiliary_loss_mlp": 0.01261812, + "balance_loss_clip": 1.12952209, + "balance_loss_mlp": 1.02606368, + "epoch": 0.7078698971923285, + "flos": 45481589796960.0, + "grad_norm": 1.947035629351915, + "language_loss": 0.73762047, + "learning_rate": 8.302348820245342e-07, + "loss": 0.7651, + "num_input_tokens_seen": 126553360, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.3515625, + "step": 5887, + "time_per_iteration": 3.183136224746704 + }, + { + "auxiliary_loss_clip": 0.01484692, + "auxiliary_loss_mlp": 0.01272934, + "balance_loss_clip": 1.12756419, + "balance_loss_mlp": 1.035851, + "epoch": 0.7079901400829676, + "flos": 26946178427520.0, + "grad_norm": 2.4510631415583797, + "language_loss": 0.6993084, + "learning_rate": 8.296031328506232e-07, + "loss": 0.72688472, + "num_input_tokens_seen": 126573110, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.36523438, + "step": 5888, + "time_per_iteration": 2.9389710426330566 + }, + { + "auxiliary_loss_clip": 0.01485719, + "auxiliary_loss_mlp": 0.01260802, + "balance_loss_clip": 1.1298064, + "balance_loss_mlp": 1.02886891, + "epoch": 0.7081103829736067, + "flos": 24425371919520.0, + "grad_norm": 1.8417972859121545, + "language_loss": 0.75523126, + "learning_rate": 8.289715612249857e-07, + "loss": 0.78269649, + "num_input_tokens_seen": 126593725, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.31640625, + "step": 5889, + "time_per_iteration": 3.0528459548950195 + }, + { + "auxiliary_loss_clip": 0.01484284, + "auxiliary_loss_mlp": 0.0127323, + "balance_loss_clip": 1.12796962, + "balance_loss_mlp": 1.0390079, + "epoch": 0.7082306258642458, + "flos": 18544817553120.0, + "grad_norm": 2.9230913605337716, + "language_loss": 0.77513194, + "learning_rate": 8.283401672434305e-07, + "loss": 0.80270702, + "num_input_tokens_seen": 126608950, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.33789062, + "step": 5890, + "time_per_iteration": 4.675900936126709 + }, + { + "auxiliary_loss_clip": 0.01487835, + "auxiliary_loss_mlp": 0.01261584, + "balance_loss_clip": 1.13244987, + "balance_loss_mlp": 1.02869642, + "epoch": 0.7083508687548848, + "flos": 23479813418400.0, + "grad_norm": 1.960516749863166, + "language_loss": 0.70190448, + "learning_rate": 8.277089510017412e-07, + "loss": 0.72939861, + "num_input_tokens_seen": 126629755, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 2.32421875, + "step": 5891, + "time_per_iteration": 2.943143367767334 + }, + { + "auxiliary_loss_clip": 0.01490246, + "auxiliary_loss_mlp": 0.01264497, + "balance_loss_clip": 1.13503718, + "balance_loss_mlp": 1.02932048, + "epoch": 0.708471111645524, + "flos": 22421683045440.0, + "grad_norm": 1.8180588446226893, + "language_loss": 0.8239755, + "learning_rate": 8.270779125956719e-07, + "loss": 0.85152292, + "num_input_tokens_seen": 126650135, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.34570312, + "step": 5892, + "time_per_iteration": 2.9926841259002686 + }, + { + "auxiliary_loss_clip": 0.01485283, + "auxiliary_loss_mlp": 0.01264653, + "balance_loss_clip": 1.12994885, + "balance_loss_mlp": 1.0308125, + "epoch": 0.7085913545361631, + "flos": 20924984986560.0, + "grad_norm": 2.065228262867321, + "language_loss": 0.80178899, + "learning_rate": 8.264470521209505e-07, + "loss": 0.82928824, + "num_input_tokens_seen": 126668500, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.33203125, + "step": 5893, + "time_per_iteration": 2.9518749713897705 + }, + { + "auxiliary_loss_clip": 0.0148225, + "auxiliary_loss_mlp": 0.0125932, + "balance_loss_clip": 1.12614083, + "balance_loss_mlp": 1.02509809, + "epoch": 0.7087115974268021, + "flos": 15014163584160.0, + "grad_norm": 3.1559805508414622, + "language_loss": 0.76446617, + "learning_rate": 8.258163696732785e-07, + "loss": 0.7918818, + "num_input_tokens_seen": 126686090, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.3359375, + "step": 5894, + "time_per_iteration": 2.9419281482696533 + }, + { + "auxiliary_loss_clip": 0.01486854, + "auxiliary_loss_mlp": 0.01260384, + "balance_loss_clip": 1.13134146, + "balance_loss_mlp": 1.02768755, + "epoch": 0.7088318403174413, + "flos": 21540944498400.0, + "grad_norm": 2.0179817775093554, + "language_loss": 0.77058065, + "learning_rate": 8.251858653483288e-07, + "loss": 0.79805303, + "num_input_tokens_seen": 126704255, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.32421875, + "step": 5895, + "time_per_iteration": 2.9568121433258057 + }, + { + "auxiliary_loss_clip": 0.01489139, + "auxiliary_loss_mlp": 0.01275554, + "balance_loss_clip": 1.13277173, + "balance_loss_mlp": 1.04152226, + "epoch": 0.7089520832080803, + "flos": 15518233931040.0, + "grad_norm": 2.572676363449091, + "language_loss": 0.85938954, + "learning_rate": 8.245555392417501e-07, + "loss": 0.8870365, + "num_input_tokens_seen": 126718910, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.3359375, + "step": 5896, + "time_per_iteration": 2.884680986404419 + }, + { + "auxiliary_loss_clip": 0.01481862, + "auxiliary_loss_mlp": 0.01259699, + "balance_loss_clip": 1.12554955, + "balance_loss_mlp": 1.02643013, + "epoch": 0.7090723260987194, + "flos": 20414770277760.0, + "grad_norm": 1.8784090753659015, + "language_loss": 0.79007196, + "learning_rate": 8.239253914491613e-07, + "loss": 0.8174876, + "num_input_tokens_seen": 126737235, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.328125, + "step": 5897, + "time_per_iteration": 2.9444286823272705 + }, + { + "auxiliary_loss_clip": 0.01491523, + "auxiliary_loss_mlp": 0.01257854, + "balance_loss_clip": 1.13675046, + "balance_loss_mlp": 1.02134252, + "epoch": 0.7091925689893585, + "flos": 25670831296320.0, + "grad_norm": 2.3150888360669017, + "language_loss": 0.75198174, + "learning_rate": 8.232954220661556e-07, + "loss": 0.77947545, + "num_input_tokens_seen": 126759970, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.359375, + "step": 5898, + "time_per_iteration": 3.019787549972534 + }, + { + "auxiliary_loss_clip": 0.0149224, + "auxiliary_loss_mlp": 0.01270229, + "balance_loss_clip": 1.13540816, + "balance_loss_mlp": 1.03963065, + "epoch": 0.7093128118799976, + "flos": 24208913724480.0, + "grad_norm": 3.0352086648669467, + "language_loss": 0.70836556, + "learning_rate": 8.226656311882989e-07, + "loss": 0.73599023, + "num_input_tokens_seen": 126779280, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 2.30273438, + "step": 5899, + "time_per_iteration": 3.8397598266601562 + }, + { + "auxiliary_loss_clip": 0.01487171, + "auxiliary_loss_mlp": 0.01264836, + "balance_loss_clip": 1.13134599, + "balance_loss_mlp": 1.03233027, + "epoch": 0.7094330547706367, + "flos": 16648238895840.0, + "grad_norm": 10.337483387462576, + "language_loss": 0.76796997, + "learning_rate": 8.22036018911129e-07, + "loss": 0.79549003, + "num_input_tokens_seen": 126797310, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.3203125, + "step": 5900, + "time_per_iteration": 2.975020170211792 + }, + { + "auxiliary_loss_clip": 0.01487586, + "auxiliary_loss_mlp": 0.0127945, + "balance_loss_clip": 1.13172054, + "balance_loss_mlp": 1.04160357, + "epoch": 0.7095532976612757, + "flos": 16284883479840.0, + "grad_norm": 2.2059949318361958, + "language_loss": 0.8080219, + "learning_rate": 8.214065853301599e-07, + "loss": 0.83569223, + "num_input_tokens_seen": 126812840, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.37304688, + "step": 5901, + "time_per_iteration": 2.9279720783233643 + }, + { + "auxiliary_loss_clip": 0.01445326, + "auxiliary_loss_mlp": 0.01191925, + "balance_loss_clip": 1.1036067, + "balance_loss_mlp": 1.00080872, + "epoch": 0.7096735405519149, + "flos": 70728797102400.0, + "grad_norm": 0.828006261226321, + "language_loss": 0.58144081, + "learning_rate": 8.207773305408734e-07, + "loss": 0.60781336, + "num_input_tokens_seen": 126880060, + "router_z_loss_clip": 3.421875, + "router_z_loss_mlp": 1.91015625, + "step": 5902, + "time_per_iteration": 3.5214059352874756 + }, + { + "auxiliary_loss_clip": 0.01485532, + "auxiliary_loss_mlp": 0.01269676, + "balance_loss_clip": 1.12863135, + "balance_loss_mlp": 1.03335536, + "epoch": 0.709793783442554, + "flos": 23623676386560.0, + "grad_norm": 1.9709457176624783, + "language_loss": 0.7998758, + "learning_rate": 8.201482546387288e-07, + "loss": 0.82742786, + "num_input_tokens_seen": 126899535, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.35742188, + "step": 5903, + "time_per_iteration": 2.9935202598571777 + }, + { + "auxiliary_loss_clip": 0.01483701, + "auxiliary_loss_mlp": 0.01266681, + "balance_loss_clip": 1.12687302, + "balance_loss_mlp": 1.03341258, + "epoch": 0.709914026333193, + "flos": 25995803050080.0, + "grad_norm": 1.7345236806464637, + "language_loss": 0.92169178, + "learning_rate": 8.195193577191553e-07, + "loss": 0.94919562, + "num_input_tokens_seen": 126921365, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.328125, + "step": 5904, + "time_per_iteration": 3.051790714263916 + }, + { + "auxiliary_loss_clip": 0.0148367, + "auxiliary_loss_mlp": 0.01259418, + "balance_loss_clip": 1.12721801, + "balance_loss_mlp": 1.02538645, + "epoch": 0.7100342692238322, + "flos": 24863749964640.0, + "grad_norm": 1.91008039927988, + "language_loss": 0.84672892, + "learning_rate": 8.188906398775579e-07, + "loss": 0.87415981, + "num_input_tokens_seen": 126941910, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.33398438, + "step": 5905, + "time_per_iteration": 2.9847612380981445 + }, + { + "auxiliary_loss_clip": 0.01485695, + "auxiliary_loss_mlp": 0.01262867, + "balance_loss_clip": 1.12940443, + "balance_loss_mlp": 1.02730942, + "epoch": 0.7101545121144712, + "flos": 24934297070880.0, + "grad_norm": 2.0041343096486295, + "language_loss": 0.69223487, + "learning_rate": 8.18262101209311e-07, + "loss": 0.71972048, + "num_input_tokens_seen": 126961120, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.34960938, + "step": 5906, + "time_per_iteration": 2.983966827392578 + }, + { + "auxiliary_loss_clip": 0.01487182, + "auxiliary_loss_mlp": 0.01263906, + "balance_loss_clip": 1.12998295, + "balance_loss_mlp": 1.02968335, + "epoch": 0.7102747550051103, + "flos": 23771104601760.0, + "grad_norm": 2.31450574722304, + "language_loss": 0.70250469, + "learning_rate": 8.176337418097626e-07, + "loss": 0.73001558, + "num_input_tokens_seen": 126981590, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.33789062, + "step": 5907, + "time_per_iteration": 2.993474245071411 + }, + { + "auxiliary_loss_clip": 0.01483028, + "auxiliary_loss_mlp": 0.01266339, + "balance_loss_clip": 1.12659442, + "balance_loss_mlp": 1.03345227, + "epoch": 0.7103949978957494, + "flos": 15305682336480.0, + "grad_norm": 2.1698471163901734, + "language_loss": 0.79897046, + "learning_rate": 8.170055617742364e-07, + "loss": 0.82646418, + "num_input_tokens_seen": 126998870, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.32421875, + "step": 5908, + "time_per_iteration": 3.0633249282836914 + }, + { + "auxiliary_loss_clip": 0.01488003, + "auxiliary_loss_mlp": 0.01256841, + "balance_loss_clip": 1.13160825, + "balance_loss_mlp": 1.02204633, + "epoch": 0.7105152407863885, + "flos": 22641061708800.0, + "grad_norm": 1.8911434257282638, + "language_loss": 0.70875353, + "learning_rate": 8.163775611980252e-07, + "loss": 0.73620194, + "num_input_tokens_seen": 127017980, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.34375, + "step": 5909, + "time_per_iteration": 2.9819068908691406 + }, + { + "auxiliary_loss_clip": 0.01482031, + "auxiliary_loss_mlp": 0.01264218, + "balance_loss_clip": 1.12524867, + "balance_loss_mlp": 1.03075862, + "epoch": 0.7106354836770276, + "flos": 17240530871520.0, + "grad_norm": 1.9159009961150968, + "language_loss": 0.78513467, + "learning_rate": 8.157497401763982e-07, + "loss": 0.81259716, + "num_input_tokens_seen": 127035645, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.33007812, + "step": 5910, + "time_per_iteration": 2.9156267642974854 + }, + { + "auxiliary_loss_clip": 0.01482155, + "auxiliary_loss_mlp": 0.01267324, + "balance_loss_clip": 1.12596977, + "balance_loss_mlp": 1.03291059, + "epoch": 0.7107557265676667, + "flos": 20195732967840.0, + "grad_norm": 1.7433521372513696, + "language_loss": 0.78420317, + "learning_rate": 8.151220988045935e-07, + "loss": 0.81169796, + "num_input_tokens_seen": 127054900, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.33789062, + "step": 5911, + "time_per_iteration": 2.9560914039611816 + }, + { + "auxiliary_loss_clip": 0.01481004, + "auxiliary_loss_mlp": 0.01263199, + "balance_loss_clip": 1.12509394, + "balance_loss_mlp": 1.03107452, + "epoch": 0.7108759694583058, + "flos": 21509236192320.0, + "grad_norm": 1.8295086660609385, + "language_loss": 0.82948822, + "learning_rate": 8.144946371778234e-07, + "loss": 0.85693026, + "num_input_tokens_seen": 127075010, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.31640625, + "step": 5912, + "time_per_iteration": 2.9404468536376953 + }, + { + "auxiliary_loss_clip": 0.01482788, + "auxiliary_loss_mlp": 0.01264405, + "balance_loss_clip": 1.12535095, + "balance_loss_mlp": 1.02713084, + "epoch": 0.7109962123489448, + "flos": 24064254264960.0, + "grad_norm": 1.9061683896568202, + "language_loss": 0.78305197, + "learning_rate": 8.138673553912751e-07, + "loss": 0.81052393, + "num_input_tokens_seen": 127095570, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.3671875, + "step": 5913, + "time_per_iteration": 3.17853045463562 + }, + { + "auxiliary_loss_clip": 0.01484957, + "auxiliary_loss_mlp": 0.01271342, + "balance_loss_clip": 1.12932992, + "balance_loss_mlp": 1.03406835, + "epoch": 0.711116455239584, + "flos": 30483014686560.0, + "grad_norm": 2.4191420895341262, + "language_loss": 0.56621855, + "learning_rate": 8.132402535401059e-07, + "loss": 0.59378153, + "num_input_tokens_seen": 127116825, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.3671875, + "step": 5914, + "time_per_iteration": 4.0233800411224365 + }, + { + "auxiliary_loss_clip": 0.01481192, + "auxiliary_loss_mlp": 0.01265686, + "balance_loss_clip": 1.12531304, + "balance_loss_mlp": 1.03165483, + "epoch": 0.711236698130223, + "flos": 25047930931200.0, + "grad_norm": 1.7724223822514713, + "language_loss": 0.74415612, + "learning_rate": 8.126133317194465e-07, + "loss": 0.77162492, + "num_input_tokens_seen": 127137015, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.33398438, + "step": 5915, + "time_per_iteration": 2.9708855152130127 + }, + { + "auxiliary_loss_clip": 0.01483209, + "auxiliary_loss_mlp": 0.01261313, + "balance_loss_clip": 1.12673807, + "balance_loss_mlp": 1.02575529, + "epoch": 0.7113569410208621, + "flos": 24208989580800.0, + "grad_norm": 1.9116063578717692, + "language_loss": 0.74401498, + "learning_rate": 8.11986590024401e-07, + "loss": 0.77146024, + "num_input_tokens_seen": 127156755, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.34960938, + "step": 5916, + "time_per_iteration": 3.960700750350952 + }, + { + "auxiliary_loss_clip": 0.01482128, + "auxiliary_loss_mlp": 0.01261855, + "balance_loss_clip": 1.12462199, + "balance_loss_mlp": 1.0251534, + "epoch": 0.7114771839115013, + "flos": 35441943585120.0, + "grad_norm": 1.815938081398321, + "language_loss": 0.68970692, + "learning_rate": 8.113600285500442e-07, + "loss": 0.71714675, + "num_input_tokens_seen": 127176965, + "router_z_loss_clip": 3.57617188, + "router_z_loss_mlp": 2.36132812, + "step": 5917, + "time_per_iteration": 3.9558541774749756 + }, + { + "auxiliary_loss_clip": 0.01483974, + "auxiliary_loss_mlp": 0.01259064, + "balance_loss_clip": 1.12839496, + "balance_loss_mlp": 1.02407837, + "epoch": 0.7115974268021403, + "flos": 21101238967680.0, + "grad_norm": 1.814401224216289, + "language_loss": 0.74435496, + "learning_rate": 8.107336473914268e-07, + "loss": 0.77178538, + "num_input_tokens_seen": 127195595, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.34570312, + "step": 5918, + "time_per_iteration": 3.0488200187683105 + }, + { + "auxiliary_loss_clip": 0.01444089, + "auxiliary_loss_mlp": 0.01198303, + "balance_loss_clip": 1.10333061, + "balance_loss_mlp": 1.00756836, + "epoch": 0.7117176696927794, + "flos": 56759071461120.0, + "grad_norm": 0.8193990758713878, + "language_loss": 0.55677044, + "learning_rate": 8.101074466435694e-07, + "loss": 0.58319438, + "num_input_tokens_seen": 127255070, + "router_z_loss_clip": 3.40625, + "router_z_loss_mlp": 1.90625, + "step": 5919, + "time_per_iteration": 3.4442176818847656 + }, + { + "auxiliary_loss_clip": 0.0148765, + "auxiliary_loss_mlp": 0.01266748, + "balance_loss_clip": 1.13273335, + "balance_loss_mlp": 1.02947354, + "epoch": 0.7118379125834186, + "flos": 15927141431520.0, + "grad_norm": 2.0751176217674177, + "language_loss": 0.68008554, + "learning_rate": 8.094814264014662e-07, + "loss": 0.70762956, + "num_input_tokens_seen": 127273825, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.3671875, + "step": 5920, + "time_per_iteration": 3.0637741088867188 + }, + { + "auxiliary_loss_clip": 0.01482262, + "auxiliary_loss_mlp": 0.01264785, + "balance_loss_clip": 1.12548852, + "balance_loss_mlp": 1.030563, + "epoch": 0.7119581554740576, + "flos": 20195429542560.0, + "grad_norm": 1.961627536322737, + "language_loss": 0.81356871, + "learning_rate": 8.088555867600844e-07, + "loss": 0.84103918, + "num_input_tokens_seen": 127289990, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.33984375, + "step": 5921, + "time_per_iteration": 2.9908719062805176 + }, + { + "auxiliary_loss_clip": 0.01492623, + "auxiliary_loss_mlp": 0.01258045, + "balance_loss_clip": 1.13620853, + "balance_loss_mlp": 1.02782822, + "epoch": 0.7120783983646967, + "flos": 34718304934080.0, + "grad_norm": 2.887430829494895, + "language_loss": 0.60408223, + "learning_rate": 8.08229927814362e-07, + "loss": 0.63158894, + "num_input_tokens_seen": 127312880, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.29882812, + "step": 5922, + "time_per_iteration": 3.121943235397339 + }, + { + "auxiliary_loss_clip": 0.01483037, + "auxiliary_loss_mlp": 0.01261234, + "balance_loss_clip": 1.12711382, + "balance_loss_mlp": 1.02853751, + "epoch": 0.7121986412553358, + "flos": 26361510012000.0, + "grad_norm": 2.025008825657294, + "language_loss": 0.65281832, + "learning_rate": 8.076044496592134e-07, + "loss": 0.68026102, + "num_input_tokens_seen": 127334730, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.32421875, + "step": 5923, + "time_per_iteration": 3.1140453815460205 + }, + { + "auxiliary_loss_clip": 0.01486625, + "auxiliary_loss_mlp": 0.0127219, + "balance_loss_clip": 1.12985396, + "balance_loss_mlp": 1.03987503, + "epoch": 0.7123188841459749, + "flos": 11146931844480.0, + "grad_norm": 2.322318735540366, + "language_loss": 0.77652735, + "learning_rate": 8.069791523895204e-07, + "loss": 0.80411547, + "num_input_tokens_seen": 127351180, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.31640625, + "step": 5924, + "time_per_iteration": 3.005018949508667 + }, + { + "auxiliary_loss_clip": 0.01485459, + "auxiliary_loss_mlp": 0.01256594, + "balance_loss_clip": 1.12842441, + "balance_loss_mlp": 1.0235157, + "epoch": 0.7124391270366139, + "flos": 20813664744000.0, + "grad_norm": 2.7082341822582534, + "language_loss": 0.77657771, + "learning_rate": 8.063540361001422e-07, + "loss": 0.80399823, + "num_input_tokens_seen": 127369750, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.33007812, + "step": 5925, + "time_per_iteration": 2.975637435913086 + }, + { + "auxiliary_loss_clip": 0.014859, + "auxiliary_loss_mlp": 0.01271294, + "balance_loss_clip": 1.12968004, + "balance_loss_mlp": 1.03630912, + "epoch": 0.7125593699272531, + "flos": 17605706839200.0, + "grad_norm": 6.655783305538062, + "language_loss": 0.79383075, + "learning_rate": 8.057291008859069e-07, + "loss": 0.82140267, + "num_input_tokens_seen": 127387910, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.34375, + "step": 5926, + "time_per_iteration": 3.780390501022339 + }, + { + "auxiliary_loss_clip": 0.01484769, + "auxiliary_loss_mlp": 0.01256837, + "balance_loss_clip": 1.12868965, + "balance_loss_mlp": 1.02337766, + "epoch": 0.7126796128178922, + "flos": 28656376284960.0, + "grad_norm": 2.2211895246462285, + "language_loss": 0.68432951, + "learning_rate": 8.051043468416187e-07, + "loss": 0.71174556, + "num_input_tokens_seen": 127409160, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.33007812, + "step": 5927, + "time_per_iteration": 3.044311046600342 + }, + { + "auxiliary_loss_clip": 0.0149224, + "auxiliary_loss_mlp": 0.01263306, + "balance_loss_clip": 1.13680172, + "balance_loss_mlp": 1.03270793, + "epoch": 0.7127998557085312, + "flos": 16036299768960.0, + "grad_norm": 2.1789460781460077, + "language_loss": 0.8219313, + "learning_rate": 8.044797740620506e-07, + "loss": 0.84948671, + "num_input_tokens_seen": 127427765, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.3046875, + "step": 5928, + "time_per_iteration": 2.995760679244995 + }, + { + "auxiliary_loss_clip": 0.0148724, + "auxiliary_loss_mlp": 0.01270268, + "balance_loss_clip": 1.13067353, + "balance_loss_mlp": 1.03623617, + "epoch": 0.7129200985991703, + "flos": 23405473496160.0, + "grad_norm": 2.8682669118400748, + "language_loss": 0.78797829, + "learning_rate": 8.038553826419494e-07, + "loss": 0.81555331, + "num_input_tokens_seen": 127446475, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.33398438, + "step": 5929, + "time_per_iteration": 2.9502007961273193 + }, + { + "auxiliary_loss_clip": 0.01481157, + "auxiliary_loss_mlp": 0.01258962, + "balance_loss_clip": 1.12553549, + "balance_loss_mlp": 1.02416754, + "epoch": 0.7130403414898094, + "flos": 21399584788800.0, + "grad_norm": 2.0665958353570195, + "language_loss": 0.81208694, + "learning_rate": 8.032311726760364e-07, + "loss": 0.83948815, + "num_input_tokens_seen": 127467695, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.34179688, + "step": 5930, + "time_per_iteration": 3.0266029834747314 + }, + { + "auxiliary_loss_clip": 0.01483484, + "auxiliary_loss_mlp": 0.01271829, + "balance_loss_clip": 1.12642467, + "balance_loss_mlp": 1.03493619, + "epoch": 0.7131605843804485, + "flos": 74744140128480.0, + "grad_norm": 1.9001695837392554, + "language_loss": 0.68620718, + "learning_rate": 8.026071442590022e-07, + "loss": 0.71376026, + "num_input_tokens_seen": 127494590, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.36523438, + "step": 5931, + "time_per_iteration": 3.4301867485046387 + }, + { + "auxiliary_loss_clip": 0.01492491, + "auxiliary_loss_mlp": 0.01266343, + "balance_loss_clip": 1.13759327, + "balance_loss_mlp": 1.0330739, + "epoch": 0.7132808272710875, + "flos": 18370687548960.0, + "grad_norm": 2.176356131185899, + "language_loss": 0.80738795, + "learning_rate": 8.019832974855134e-07, + "loss": 0.83497632, + "num_input_tokens_seen": 127512550, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 2.328125, + "step": 5932, + "time_per_iteration": 3.0015594959259033 + }, + { + "auxiliary_loss_clip": 0.01489854, + "auxiliary_loss_mlp": 0.01264277, + "balance_loss_clip": 1.13318884, + "balance_loss_mlp": 1.03138959, + "epoch": 0.7134010701617267, + "flos": 23255352381600.0, + "grad_norm": 2.2395119084473953, + "language_loss": 0.82654983, + "learning_rate": 8.013596324502052e-07, + "loss": 0.85409117, + "num_input_tokens_seen": 127531015, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.32226562, + "step": 5933, + "time_per_iteration": 3.097752094268799 + }, + { + "auxiliary_loss_clip": 0.01483195, + "auxiliary_loss_mlp": 0.01256518, + "balance_loss_clip": 1.12658978, + "balance_loss_mlp": 1.02630091, + "epoch": 0.7135213130523658, + "flos": 23655043339200.0, + "grad_norm": 1.7953604551064666, + "language_loss": 0.78798604, + "learning_rate": 8.007361492476872e-07, + "loss": 0.81538314, + "num_input_tokens_seen": 127550340, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.29882812, + "step": 5934, + "time_per_iteration": 3.0096662044525146 + }, + { + "auxiliary_loss_clip": 0.01487555, + "auxiliary_loss_mlp": 0.01261893, + "balance_loss_clip": 1.13384473, + "balance_loss_mlp": 1.0282433, + "epoch": 0.7136415559430048, + "flos": 24792975289440.0, + "grad_norm": 1.6504981486922832, + "language_loss": 0.79178154, + "learning_rate": 8.001128479725426e-07, + "loss": 0.81927598, + "num_input_tokens_seen": 127572245, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 2.33007812, + "step": 5935, + "time_per_iteration": 3.0865511894226074 + }, + { + "auxiliary_loss_clip": 0.01484032, + "auxiliary_loss_mlp": 0.01260719, + "balance_loss_clip": 1.12818384, + "balance_loss_mlp": 1.02630579, + "epoch": 0.713761798833644, + "flos": 18298964669760.0, + "grad_norm": 1.592179359418797, + "language_loss": 0.80873173, + "learning_rate": 7.994897287193248e-07, + "loss": 0.83617926, + "num_input_tokens_seen": 127591625, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.33789062, + "step": 5936, + "time_per_iteration": 3.028640031814575 + }, + { + "auxiliary_loss_clip": 0.0148415, + "auxiliary_loss_mlp": 0.0125674, + "balance_loss_clip": 1.12876654, + "balance_loss_mlp": 1.02137303, + "epoch": 0.713882041724283, + "flos": 15559993199520.0, + "grad_norm": 3.0105605958327115, + "language_loss": 0.83670956, + "learning_rate": 7.988667915825605e-07, + "loss": 0.86411846, + "num_input_tokens_seen": 127608690, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.34765625, + "step": 5937, + "time_per_iteration": 3.052932024002075 + }, + { + "auxiliary_loss_clip": 0.01494634, + "auxiliary_loss_mlp": 0.01262255, + "balance_loss_clip": 1.14008451, + "balance_loss_mlp": 1.02612495, + "epoch": 0.7140022846149221, + "flos": 24063306060960.0, + "grad_norm": 2.0865085193447337, + "language_loss": 0.75799185, + "learning_rate": 7.982440366567491e-07, + "loss": 0.78556073, + "num_input_tokens_seen": 127627180, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 2.35742188, + "step": 5938, + "time_per_iteration": 3.058166742324829 + }, + { + "auxiliary_loss_clip": 0.0148699, + "auxiliary_loss_mlp": 0.01267636, + "balance_loss_clip": 1.1315577, + "balance_loss_mlp": 1.03341365, + "epoch": 0.7141225275055613, + "flos": 27894202259040.0, + "grad_norm": 1.5796446175691032, + "language_loss": 0.7519021, + "learning_rate": 7.97621464036361e-07, + "loss": 0.77944839, + "num_input_tokens_seen": 127648940, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.3359375, + "step": 5939, + "time_per_iteration": 3.116694211959839 + }, + { + "auxiliary_loss_clip": 0.01487053, + "auxiliary_loss_mlp": 0.01263192, + "balance_loss_clip": 1.13041985, + "balance_loss_mlp": 1.02763438, + "epoch": 0.7142427703962003, + "flos": 19684645911360.0, + "grad_norm": 1.6741392823917314, + "language_loss": 0.68684131, + "learning_rate": 7.969990738158417e-07, + "loss": 0.71434379, + "num_input_tokens_seen": 127667350, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.34960938, + "step": 5940, + "time_per_iteration": 3.1520564556121826 + }, + { + "auxiliary_loss_clip": 0.01488402, + "auxiliary_loss_mlp": 0.01265388, + "balance_loss_clip": 1.13222551, + "balance_loss_mlp": 1.03002131, + "epoch": 0.7143630132868394, + "flos": 21034484677440.0, + "grad_norm": 2.2158817026814432, + "language_loss": 0.85403204, + "learning_rate": 7.963768660896062e-07, + "loss": 0.88156998, + "num_input_tokens_seen": 127685760, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.34765625, + "step": 5941, + "time_per_iteration": 3.815864086151123 + }, + { + "auxiliary_loss_clip": 0.01493527, + "auxiliary_loss_mlp": 0.01256428, + "balance_loss_clip": 1.13718796, + "balance_loss_mlp": 1.02659249, + "epoch": 0.7144832561774785, + "flos": 24131994687360.0, + "grad_norm": 1.999851099601382, + "language_loss": 0.82780612, + "learning_rate": 7.957548409520432e-07, + "loss": 0.85530561, + "num_input_tokens_seen": 127704985, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.296875, + "step": 5942, + "time_per_iteration": 3.0672147274017334 + }, + { + "auxiliary_loss_clip": 0.01482105, + "auxiliary_loss_mlp": 0.01258015, + "balance_loss_clip": 1.1255101, + "balance_loss_mlp": 1.02531886, + "epoch": 0.7146034990681176, + "flos": 16327666808640.0, + "grad_norm": 2.9491091023518385, + "language_loss": 0.84026849, + "learning_rate": 7.951329984975135e-07, + "loss": 0.8676697, + "num_input_tokens_seen": 127721925, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.32421875, + "step": 5943, + "time_per_iteration": 3.1873462200164795 + }, + { + "auxiliary_loss_clip": 0.01447252, + "auxiliary_loss_mlp": 0.01191032, + "balance_loss_clip": 1.10755336, + "balance_loss_mlp": 1.00144196, + "epoch": 0.7147237419587567, + "flos": 69633800193600.0, + "grad_norm": 0.7137937110420208, + "language_loss": 0.54255581, + "learning_rate": 7.94511338820349e-07, + "loss": 0.56893867, + "num_input_tokens_seen": 127784230, + "router_z_loss_clip": 3.390625, + "router_z_loss_mlp": 1.89453125, + "step": 5944, + "time_per_iteration": 4.473188638687134 + }, + { + "auxiliary_loss_clip": 0.01485329, + "auxiliary_loss_mlp": 0.01272239, + "balance_loss_clip": 1.12929034, + "balance_loss_mlp": 1.03668094, + "epoch": 0.7148439848493958, + "flos": 22268641462560.0, + "grad_norm": 2.366668787896998, + "language_loss": 0.78253311, + "learning_rate": 7.938898620148575e-07, + "loss": 0.81010878, + "num_input_tokens_seen": 127801990, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.34960938, + "step": 5945, + "time_per_iteration": 4.040475130081177 + }, + { + "auxiliary_loss_clip": 0.0148702, + "auxiliary_loss_mlp": 0.01264739, + "balance_loss_clip": 1.13108051, + "balance_loss_mlp": 1.0312798, + "epoch": 0.7149642277400349, + "flos": 17933333564160.0, + "grad_norm": 1.9222305588354651, + "language_loss": 0.71246189, + "learning_rate": 7.932685681753135e-07, + "loss": 0.73997951, + "num_input_tokens_seen": 127819270, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.328125, + "step": 5946, + "time_per_iteration": 3.1096062660217285 + }, + { + "auxiliary_loss_clip": 0.01493519, + "auxiliary_loss_mlp": 0.0125454, + "balance_loss_clip": 1.13821054, + "balance_loss_mlp": 1.02298784, + "epoch": 0.7150844706306739, + "flos": 31684401177120.0, + "grad_norm": 16.30653844003597, + "language_loss": 0.62676644, + "learning_rate": 7.92647457395969e-07, + "loss": 0.65424705, + "num_input_tokens_seen": 127841095, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 2.30859375, + "step": 5947, + "time_per_iteration": 3.1119823455810547 + }, + { + "auxiliary_loss_clip": 0.01490001, + "auxiliary_loss_mlp": 0.0127186, + "balance_loss_clip": 1.1350596, + "balance_loss_mlp": 1.03706551, + "epoch": 0.7152047135213131, + "flos": 10927780750080.0, + "grad_norm": 2.658994119290891, + "language_loss": 0.73918378, + "learning_rate": 7.920265297710444e-07, + "loss": 0.76680231, + "num_input_tokens_seen": 127858485, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.34375, + "step": 5948, + "time_per_iteration": 3.1100759506225586 + }, + { + "auxiliary_loss_clip": 0.01489554, + "auxiliary_loss_mlp": 0.01267748, + "balance_loss_clip": 1.13405418, + "balance_loss_mlp": 1.03505135, + "epoch": 0.7153249564119522, + "flos": 20997731926080.0, + "grad_norm": 2.1148588101728683, + "language_loss": 0.73535955, + "learning_rate": 7.914057853947363e-07, + "loss": 0.76293254, + "num_input_tokens_seen": 127877665, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.32226562, + "step": 5949, + "time_per_iteration": 3.0032753944396973 + }, + { + "auxiliary_loss_clip": 0.01487651, + "auxiliary_loss_mlp": 0.01257263, + "balance_loss_clip": 1.13188481, + "balance_loss_mlp": 1.01960754, + "epoch": 0.7154451993025912, + "flos": 24245400978720.0, + "grad_norm": 1.9992731511023782, + "language_loss": 0.62593037, + "learning_rate": 7.907852243612089e-07, + "loss": 0.65337956, + "num_input_tokens_seen": 127898070, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.37109375, + "step": 5950, + "time_per_iteration": 3.0526766777038574 + }, + { + "auxiliary_loss_clip": 0.01489508, + "auxiliary_loss_mlp": 0.01259873, + "balance_loss_clip": 1.13529086, + "balance_loss_mlp": 1.02545977, + "epoch": 0.7155654421932304, + "flos": 23333181694560.0, + "grad_norm": 2.099751395078285, + "language_loss": 0.72562385, + "learning_rate": 7.901648467646009e-07, + "loss": 0.75311768, + "num_input_tokens_seen": 127917010, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 2.33984375, + "step": 5951, + "time_per_iteration": 3.0052294731140137 + }, + { + "auxiliary_loss_clip": 0.0149074, + "auxiliary_loss_mlp": 0.0126772, + "balance_loss_clip": 1.13499618, + "balance_loss_mlp": 1.03483284, + "epoch": 0.7156856850838694, + "flos": 22714150001760.0, + "grad_norm": 1.6512618290200654, + "language_loss": 0.72661936, + "learning_rate": 7.895446526990244e-07, + "loss": 0.75420403, + "num_input_tokens_seen": 127937025, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.32421875, + "step": 5952, + "time_per_iteration": 3.1373636722564697 + }, + { + "auxiliary_loss_clip": 0.01492936, + "auxiliary_loss_mlp": 0.01268925, + "balance_loss_clip": 1.1383462, + "balance_loss_mlp": 1.0337491, + "epoch": 0.7158059279745085, + "flos": 19867499392320.0, + "grad_norm": 1.841543815218066, + "language_loss": 0.75895751, + "learning_rate": 7.889246422585609e-07, + "loss": 0.78657603, + "num_input_tokens_seen": 127956410, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.34570312, + "step": 5953, + "time_per_iteration": 3.1469554901123047 + }, + { + "auxiliary_loss_clip": 0.01488297, + "auxiliary_loss_mlp": 0.01270088, + "balance_loss_clip": 1.13276196, + "balance_loss_mlp": 1.04044342, + "epoch": 0.7159261708651476, + "flos": 24137114988960.0, + "grad_norm": 1.9004403239453294, + "language_loss": 0.73791754, + "learning_rate": 7.883048155372675e-07, + "loss": 0.76550144, + "num_input_tokens_seen": 127974925, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.296875, + "step": 5954, + "time_per_iteration": 3.9285175800323486 + }, + { + "auxiliary_loss_clip": 0.0149209, + "auxiliary_loss_mlp": 0.01261792, + "balance_loss_clip": 1.13750148, + "balance_loss_mlp": 1.02985835, + "epoch": 0.7160464137557867, + "flos": 16985120091840.0, + "grad_norm": 2.472076653092895, + "language_loss": 0.71624672, + "learning_rate": 7.876851726291698e-07, + "loss": 0.74378556, + "num_input_tokens_seen": 127993225, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.31445312, + "step": 5955, + "time_per_iteration": 3.014263391494751 + }, + { + "auxiliary_loss_clip": 0.01490449, + "auxiliary_loss_mlp": 0.01259046, + "balance_loss_clip": 1.13544774, + "balance_loss_mlp": 1.02978289, + "epoch": 0.7161666566464258, + "flos": 25230594771360.0, + "grad_norm": 1.9752947883995406, + "language_loss": 0.78812021, + "learning_rate": 7.870657136282666e-07, + "loss": 0.81561518, + "num_input_tokens_seen": 128012085, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.29296875, + "step": 5956, + "time_per_iteration": 3.1088674068450928 + }, + { + "auxiliary_loss_clip": 0.01492115, + "auxiliary_loss_mlp": 0.01261723, + "balance_loss_clip": 1.13701451, + "balance_loss_mlp": 1.02902675, + "epoch": 0.7162868995370649, + "flos": 26470782133920.0, + "grad_norm": 2.0878116515938783, + "language_loss": 0.81929761, + "learning_rate": 7.86446438628531e-07, + "loss": 0.84683597, + "num_input_tokens_seen": 128033155, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.32226562, + "step": 5957, + "time_per_iteration": 3.0951929092407227 + }, + { + "auxiliary_loss_clip": 0.01448491, + "auxiliary_loss_mlp": 0.01195816, + "balance_loss_clip": 1.10985231, + "balance_loss_mlp": 1.0035553, + "epoch": 0.716407142427704, + "flos": 70005954942720.0, + "grad_norm": 0.7840121502650644, + "language_loss": 0.56840861, + "learning_rate": 7.858273477239059e-07, + "loss": 0.59485173, + "num_input_tokens_seen": 128101575, + "router_z_loss_clip": 3.3828125, + "router_z_loss_mlp": 1.921875, + "step": 5958, + "time_per_iteration": 3.503927230834961 + }, + { + "auxiliary_loss_clip": 0.01493161, + "auxiliary_loss_mlp": 0.01261111, + "balance_loss_clip": 1.1380254, + "balance_loss_mlp": 1.02555394, + "epoch": 0.716527385318343, + "flos": 20742548715360.0, + "grad_norm": 1.7997457369598167, + "language_loss": 0.71506608, + "learning_rate": 7.852084410083067e-07, + "loss": 0.74260885, + "num_input_tokens_seen": 128120395, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.34960938, + "step": 5959, + "time_per_iteration": 3.0200467109680176 + }, + { + "auxiliary_loss_clip": 0.01488971, + "auxiliary_loss_mlp": 0.01261198, + "balance_loss_clip": 1.13329434, + "balance_loss_mlp": 1.02964592, + "epoch": 0.7166476282089821, + "flos": 25374192242400.0, + "grad_norm": 1.5483113795848182, + "language_loss": 0.63932109, + "learning_rate": 7.84589718575621e-07, + "loss": 0.66682279, + "num_input_tokens_seen": 128140840, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.30859375, + "step": 5960, + "time_per_iteration": 2.9860033988952637 + }, + { + "auxiliary_loss_clip": 0.01492481, + "auxiliary_loss_mlp": 0.01259856, + "balance_loss_clip": 1.13726425, + "balance_loss_mlp": 1.02620625, + "epoch": 0.7167678710996213, + "flos": 24136015072320.0, + "grad_norm": 2.5272387882751413, + "language_loss": 0.69452477, + "learning_rate": 7.83971180519708e-07, + "loss": 0.72204816, + "num_input_tokens_seen": 128159695, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.33007812, + "step": 5961, + "time_per_iteration": 3.0751683712005615 + }, + { + "auxiliary_loss_clip": 0.01502145, + "auxiliary_loss_mlp": 0.01274981, + "balance_loss_clip": 1.14725101, + "balance_loss_mlp": 1.04190254, + "epoch": 0.7168881139902603, + "flos": 30229196889600.0, + "grad_norm": 2.1969578150235076, + "language_loss": 0.75480497, + "learning_rate": 7.833528269344008e-07, + "loss": 0.7825762, + "num_input_tokens_seen": 128179600, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.328125, + "step": 5962, + "time_per_iteration": 3.1903305053710938 + }, + { + "auxiliary_loss_clip": 0.01497518, + "auxiliary_loss_mlp": 0.01274084, + "balance_loss_clip": 1.14193904, + "balance_loss_mlp": 1.040815, + "epoch": 0.7170083568808994, + "flos": 14607911054880.0, + "grad_norm": 2.804349674852748, + "language_loss": 0.77738714, + "learning_rate": 7.827346579135023e-07, + "loss": 0.80510312, + "num_input_tokens_seen": 128196940, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.33007812, + "step": 5963, + "time_per_iteration": 2.95841646194458 + }, + { + "auxiliary_loss_clip": 0.01494282, + "auxiliary_loss_mlp": 0.01269903, + "balance_loss_clip": 1.1400311, + "balance_loss_mlp": 1.03815985, + "epoch": 0.7171285997715385, + "flos": 23333333407200.0, + "grad_norm": 2.875184559991175, + "language_loss": 0.83150536, + "learning_rate": 7.821166735507885e-07, + "loss": 0.85914719, + "num_input_tokens_seen": 128215970, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.3125, + "step": 5964, + "time_per_iteration": 3.1297247409820557 + }, + { + "auxiliary_loss_clip": 0.01492258, + "auxiliary_loss_mlp": 0.01261042, + "balance_loss_clip": 1.13655555, + "balance_loss_mlp": 1.02948999, + "epoch": 0.7172488426621776, + "flos": 16545338704800.0, + "grad_norm": 1.8291441539015127, + "language_loss": 0.68808174, + "learning_rate": 7.81498873940007e-07, + "loss": 0.7156148, + "num_input_tokens_seen": 128233185, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.31054688, + "step": 5965, + "time_per_iteration": 3.0788662433624268 + }, + { + "auxiliary_loss_clip": 0.01484703, + "auxiliary_loss_mlp": 0.01270146, + "balance_loss_clip": 1.12821996, + "balance_loss_mlp": 1.03268123, + "epoch": 0.7173690855528166, + "flos": 26544022139520.0, + "grad_norm": 2.709225855834035, + "language_loss": 0.77478051, + "learning_rate": 7.808812591748768e-07, + "loss": 0.802329, + "num_input_tokens_seen": 128253565, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.36914062, + "step": 5966, + "time_per_iteration": 3.1666910648345947 + }, + { + "auxiliary_loss_clip": 0.01490041, + "auxiliary_loss_mlp": 0.01268451, + "balance_loss_clip": 1.1348269, + "balance_loss_mlp": 1.03251231, + "epoch": 0.7174893284434558, + "flos": 22786328018880.0, + "grad_norm": 2.1498078416088746, + "language_loss": 0.65018362, + "learning_rate": 7.802638293490915e-07, + "loss": 0.67776853, + "num_input_tokens_seen": 128273210, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 2.35351562, + "step": 5967, + "time_per_iteration": 2.9668214321136475 + }, + { + "auxiliary_loss_clip": 0.01496351, + "auxiliary_loss_mlp": 0.01264616, + "balance_loss_clip": 1.14031935, + "balance_loss_mlp": 1.03191912, + "epoch": 0.7176095713340949, + "flos": 23295746236320.0, + "grad_norm": 2.183717523769097, + "language_loss": 0.77422774, + "learning_rate": 7.796465845563123e-07, + "loss": 0.80183744, + "num_input_tokens_seen": 128292085, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.32421875, + "step": 5968, + "time_per_iteration": 2.934461832046509 + }, + { + "auxiliary_loss_clip": 0.01493805, + "auxiliary_loss_mlp": 0.01259944, + "balance_loss_clip": 1.13789773, + "balance_loss_mlp": 1.02953649, + "epoch": 0.7177298142247339, + "flos": 25593912259200.0, + "grad_norm": 2.0852253269776635, + "language_loss": 0.79612923, + "learning_rate": 7.790295248901766e-07, + "loss": 0.82366669, + "num_input_tokens_seen": 128313215, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.30078125, + "step": 5969, + "time_per_iteration": 3.8524158000946045 + }, + { + "auxiliary_loss_clip": 0.01494758, + "auxiliary_loss_mlp": 0.01266514, + "balance_loss_clip": 1.13968182, + "balance_loss_mlp": 1.03419864, + "epoch": 0.7178500571153731, + "flos": 31655651267520.0, + "grad_norm": 2.038044634879335, + "language_loss": 0.62531954, + "learning_rate": 7.784126504442902e-07, + "loss": 0.65293229, + "num_input_tokens_seen": 128336445, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 2.31835938, + "step": 5970, + "time_per_iteration": 3.053504228591919 + }, + { + "auxiliary_loss_clip": 0.01487324, + "auxiliary_loss_mlp": 0.0126299, + "balance_loss_clip": 1.13184214, + "balance_loss_mlp": 1.03048408, + "epoch": 0.7179703000060121, + "flos": 19429273059840.0, + "grad_norm": 1.4255616369592117, + "language_loss": 0.68382287, + "learning_rate": 7.777959613122351e-07, + "loss": 0.711326, + "num_input_tokens_seen": 128356270, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.3203125, + "step": 5971, + "time_per_iteration": 2.9856529235839844 + }, + { + "auxiliary_loss_clip": 0.01490427, + "auxiliary_loss_mlp": 0.0126485, + "balance_loss_clip": 1.13595295, + "balance_loss_mlp": 1.03482366, + "epoch": 0.7180905428966512, + "flos": 28841733024480.0, + "grad_norm": 1.7976297666429193, + "language_loss": 0.78076285, + "learning_rate": 7.771794575875604e-07, + "loss": 0.80831563, + "num_input_tokens_seen": 128378140, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 2.296875, + "step": 5972, + "time_per_iteration": 4.782358884811401 + }, + { + "auxiliary_loss_clip": 0.0149515, + "auxiliary_loss_mlp": 0.01262172, + "balance_loss_clip": 1.14075375, + "balance_loss_mlp": 1.02661443, + "epoch": 0.7182107857872904, + "flos": 20049632238240.0, + "grad_norm": 2.189810219082016, + "language_loss": 0.77326316, + "learning_rate": 7.765631393637888e-07, + "loss": 0.80083638, + "num_input_tokens_seen": 128396335, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 2.34960938, + "step": 5973, + "time_per_iteration": 2.9490878582000732 + }, + { + "auxiliary_loss_clip": 0.01489042, + "auxiliary_loss_mlp": 0.01270387, + "balance_loss_clip": 1.13303697, + "balance_loss_mlp": 1.03406644, + "epoch": 0.7183310286779294, + "flos": 22749916620960.0, + "grad_norm": 3.2971772505364663, + "language_loss": 0.48589611, + "learning_rate": 7.75947006734417e-07, + "loss": 0.51349038, + "num_input_tokens_seen": 128414115, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.35742188, + "step": 5974, + "time_per_iteration": 2.936368465423584 + }, + { + "auxiliary_loss_clip": 0.01486978, + "auxiliary_loss_mlp": 0.012609, + "balance_loss_clip": 1.13230085, + "balance_loss_mlp": 1.02896619, + "epoch": 0.7184512715685685, + "flos": 17159781090240.0, + "grad_norm": 2.1119266702302695, + "language_loss": 0.82981509, + "learning_rate": 7.753310597929101e-07, + "loss": 0.85729384, + "num_input_tokens_seen": 128430755, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.3125, + "step": 5975, + "time_per_iteration": 3.0062832832336426 + }, + { + "auxiliary_loss_clip": 0.01449019, + "auxiliary_loss_mlp": 0.01192787, + "balance_loss_clip": 1.11032367, + "balance_loss_mlp": 1.00167084, + "epoch": 0.7185715144592076, + "flos": 65516315904000.0, + "grad_norm": 0.7699162477489742, + "language_loss": 0.55091554, + "learning_rate": 7.747152986327095e-07, + "loss": 0.57733363, + "num_input_tokens_seen": 128491300, + "router_z_loss_clip": 3.3828125, + "router_z_loss_mlp": 1.91015625, + "step": 5976, + "time_per_iteration": 3.405298948287964 + }, + { + "auxiliary_loss_clip": 0.01489714, + "auxiliary_loss_mlp": 0.01262472, + "balance_loss_clip": 1.1347084, + "balance_loss_mlp": 1.03225493, + "epoch": 0.7186917573498467, + "flos": 16182400498560.0, + "grad_norm": 1.9299652287989868, + "language_loss": 0.68030643, + "learning_rate": 7.740997233472228e-07, + "loss": 0.70782828, + "num_input_tokens_seen": 128508920, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.30078125, + "step": 5977, + "time_per_iteration": 2.9557223320007324 + }, + { + "auxiliary_loss_clip": 0.0148695, + "auxiliary_loss_mlp": 0.01261399, + "balance_loss_clip": 1.13198185, + "balance_loss_mlp": 1.03194511, + "epoch": 0.7188120002404857, + "flos": 29244875444640.0, + "grad_norm": 2.3550680713575343, + "language_loss": 0.707838, + "learning_rate": 7.734843340298329e-07, + "loss": 0.73532146, + "num_input_tokens_seen": 128528745, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 2.29296875, + "step": 5978, + "time_per_iteration": 3.067960023880005 + }, + { + "auxiliary_loss_clip": 0.01491522, + "auxiliary_loss_mlp": 0.01268022, + "balance_loss_clip": 1.1375885, + "balance_loss_mlp": 1.03360879, + "epoch": 0.7189322431311249, + "flos": 33403967290080.0, + "grad_norm": 1.9285589694191063, + "language_loss": 0.75200105, + "learning_rate": 7.72869130773895e-07, + "loss": 0.77959651, + "num_input_tokens_seen": 128549345, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 2.33789062, + "step": 5979, + "time_per_iteration": 3.090108633041382 + }, + { + "auxiliary_loss_clip": 0.01446952, + "auxiliary_loss_mlp": 0.01192825, + "balance_loss_clip": 1.10804451, + "balance_loss_mlp": 1.00285339, + "epoch": 0.719052486021764, + "flos": 61357792980960.0, + "grad_norm": 0.7949396790217753, + "language_loss": 0.59298605, + "learning_rate": 7.722541136727343e-07, + "loss": 0.61938387, + "num_input_tokens_seen": 128605360, + "router_z_loss_clip": 3.390625, + "router_z_loss_mlp": 1.8984375, + "step": 5980, + "time_per_iteration": 3.252713203430176 + }, + { + "auxiliary_loss_clip": 0.01488889, + "auxiliary_loss_mlp": 0.01264212, + "balance_loss_clip": 1.13468528, + "balance_loss_mlp": 1.03151584, + "epoch": 0.719172728912403, + "flos": 15598642358880.0, + "grad_norm": 2.221655123633944, + "language_loss": 0.80964559, + "learning_rate": 7.716392828196483e-07, + "loss": 0.83717662, + "num_input_tokens_seen": 128623160, + "router_z_loss_clip": 3.54101562, + "router_z_loss_mlp": 2.32226562, + "step": 5981, + "time_per_iteration": 3.2182419300079346 + }, + { + "auxiliary_loss_clip": 0.0149119, + "auxiliary_loss_mlp": 0.01270087, + "balance_loss_clip": 1.13593411, + "balance_loss_mlp": 1.03586447, + "epoch": 0.7192929718030422, + "flos": 15554607400800.0, + "grad_norm": 2.5840434329216064, + "language_loss": 0.77265394, + "learning_rate": 7.710246383079064e-07, + "loss": 0.80026674, + "num_input_tokens_seen": 128638545, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.3359375, + "step": 5982, + "time_per_iteration": 3.8921210765838623 + }, + { + "auxiliary_loss_clip": 0.01488708, + "auxiliary_loss_mlp": 0.01263205, + "balance_loss_clip": 1.13518476, + "balance_loss_mlp": 1.03108072, + "epoch": 0.7194132146936812, + "flos": 21864133628640.0, + "grad_norm": 2.6355027001197087, + "language_loss": 0.91955775, + "learning_rate": 7.704101802307492e-07, + "loss": 0.9470768, + "num_input_tokens_seen": 128650845, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 2.31640625, + "step": 5983, + "time_per_iteration": 3.0722172260284424 + }, + { + "auxiliary_loss_clip": 0.01491661, + "auxiliary_loss_mlp": 0.01267495, + "balance_loss_clip": 1.13571095, + "balance_loss_mlp": 1.03537035, + "epoch": 0.7195334575843203, + "flos": 27341052508800.0, + "grad_norm": 3.064023202149081, + "language_loss": 0.87103963, + "learning_rate": 7.697959086813912e-07, + "loss": 0.89863122, + "num_input_tokens_seen": 128667010, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.31835938, + "step": 5984, + "time_per_iteration": 3.102344036102295 + }, + { + "auxiliary_loss_clip": 0.01486793, + "auxiliary_loss_mlp": 0.01260033, + "balance_loss_clip": 1.13122296, + "balance_loss_mlp": 1.02905309, + "epoch": 0.7196537004749595, + "flos": 18772616268000.0, + "grad_norm": 1.615996141448698, + "language_loss": 0.80392963, + "learning_rate": 7.691818237530145e-07, + "loss": 0.83139789, + "num_input_tokens_seen": 128685870, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.30664062, + "step": 5985, + "time_per_iteration": 2.922320604324341 + }, + { + "auxiliary_loss_clip": 0.01496745, + "auxiliary_loss_mlp": 0.01274883, + "balance_loss_clip": 1.14209795, + "balance_loss_mlp": 1.04199624, + "epoch": 0.7197739433655985, + "flos": 24533088986880.0, + "grad_norm": 1.9623852948605807, + "language_loss": 0.77589077, + "learning_rate": 7.685679255387774e-07, + "loss": 0.80360705, + "num_input_tokens_seen": 128704185, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.32226562, + "step": 5986, + "time_per_iteration": 3.053194999694824 + }, + { + "auxiliary_loss_clip": 0.01487185, + "auxiliary_loss_mlp": 0.01266026, + "balance_loss_clip": 1.13041711, + "balance_loss_mlp": 1.03561854, + "epoch": 0.7198941862562376, + "flos": 18042529829760.0, + "grad_norm": 1.9692250265025413, + "language_loss": 0.76881218, + "learning_rate": 7.679542141318065e-07, + "loss": 0.79634428, + "num_input_tokens_seen": 128721290, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 2.30078125, + "step": 5987, + "time_per_iteration": 3.0130937099456787 + }, + { + "auxiliary_loss_clip": 0.01493005, + "auxiliary_loss_mlp": 0.01258373, + "balance_loss_clip": 1.13832521, + "balance_loss_mlp": 1.02872813, + "epoch": 0.7200144291468767, + "flos": 29024927858880.0, + "grad_norm": 1.768387639515764, + "language_loss": 0.7596612, + "learning_rate": 7.673406896252013e-07, + "loss": 0.787175, + "num_input_tokens_seen": 128742665, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 2.29296875, + "step": 5988, + "time_per_iteration": 3.0814602375030518 + }, + { + "auxiliary_loss_clip": 0.0149034, + "auxiliary_loss_mlp": 0.01265849, + "balance_loss_clip": 1.13431776, + "balance_loss_mlp": 1.03162622, + "epoch": 0.7201346720375158, + "flos": 25376619644640.0, + "grad_norm": 1.6110983092069322, + "language_loss": 0.78411621, + "learning_rate": 7.667273521120347e-07, + "loss": 0.81167805, + "num_input_tokens_seen": 128762225, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.33789062, + "step": 5989, + "time_per_iteration": 3.0185916423797607 + }, + { + "auxiliary_loss_clip": 0.01488019, + "auxiliary_loss_mlp": 0.01259911, + "balance_loss_clip": 1.1330719, + "balance_loss_mlp": 1.02645159, + "epoch": 0.7202549149281549, + "flos": 14357355079680.0, + "grad_norm": 5.4057432876334115, + "language_loss": 0.79668033, + "learning_rate": 7.661142016853468e-07, + "loss": 0.82415968, + "num_input_tokens_seen": 128779585, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 2.328125, + "step": 5990, + "time_per_iteration": 2.990490674972534 + }, + { + "auxiliary_loss_clip": 0.01484674, + "auxiliary_loss_mlp": 0.01258036, + "balance_loss_clip": 1.12934899, + "balance_loss_mlp": 1.02457619, + "epoch": 0.7203751578187939, + "flos": 23003848202400.0, + "grad_norm": 1.878412929779477, + "language_loss": 0.74592459, + "learning_rate": 7.655012384381543e-07, + "loss": 0.77335167, + "num_input_tokens_seen": 128799070, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.328125, + "step": 5991, + "time_per_iteration": 3.0702245235443115 + }, + { + "auxiliary_loss_clip": 0.01493703, + "auxiliary_loss_mlp": 0.01262189, + "balance_loss_clip": 1.13935447, + "balance_loss_mlp": 1.0291115, + "epoch": 0.7204954007094331, + "flos": 23694488989920.0, + "grad_norm": 2.030972019601029, + "language_loss": 0.81892204, + "learning_rate": 7.648884624634415e-07, + "loss": 0.84648097, + "num_input_tokens_seen": 128817620, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 2.32421875, + "step": 5992, + "time_per_iteration": 2.983552932739258 + }, + { + "auxiliary_loss_clip": 0.01492872, + "auxiliary_loss_mlp": 0.01265199, + "balance_loss_clip": 1.13788927, + "balance_loss_mlp": 1.03193045, + "epoch": 0.7206156436000721, + "flos": 16254881940960.0, + "grad_norm": 1.7885161153893296, + "language_loss": 0.88876116, + "learning_rate": 7.642758738541683e-07, + "loss": 0.91634184, + "num_input_tokens_seen": 128834200, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.32617188, + "step": 5993, + "time_per_iteration": 2.9458353519439697 + }, + { + "auxiliary_loss_clip": 0.01452613, + "auxiliary_loss_mlp": 0.01200508, + "balance_loss_clip": 1.1143415, + "balance_loss_mlp": 1.01053619, + "epoch": 0.7207358864907112, + "flos": 54383682975840.0, + "grad_norm": 2.8047404435399597, + "language_loss": 0.60696578, + "learning_rate": 7.636634727032621e-07, + "loss": 0.633497, + "num_input_tokens_seen": 128891305, + "router_z_loss_clip": 3.375, + "router_z_loss_mlp": 1.8984375, + "step": 5994, + "time_per_iteration": 3.303480863571167 + }, + { + "auxiliary_loss_clip": 0.01493697, + "auxiliary_loss_mlp": 0.01270291, + "balance_loss_clip": 1.1388104, + "balance_loss_mlp": 1.03549671, + "epoch": 0.7208561293813504, + "flos": 19137564666720.0, + "grad_norm": 2.5703327579749184, + "language_loss": 0.79054964, + "learning_rate": 7.630512591036231e-07, + "loss": 0.8181895, + "num_input_tokens_seen": 128910615, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.34375, + "step": 5995, + "time_per_iteration": 3.936488628387451 + }, + { + "auxiliary_loss_clip": 0.01494457, + "auxiliary_loss_mlp": 0.0126006, + "balance_loss_clip": 1.14049053, + "balance_loss_mlp": 1.02927089, + "epoch": 0.7209763722719894, + "flos": 17750442155040.0, + "grad_norm": 2.565980454137725, + "language_loss": 0.64302576, + "learning_rate": 7.624392331481255e-07, + "loss": 0.67057097, + "num_input_tokens_seen": 128928270, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 2.3046875, + "step": 5996, + "time_per_iteration": 2.9712798595428467 + }, + { + "auxiliary_loss_clip": 0.01451339, + "auxiliary_loss_mlp": 0.01196007, + "balance_loss_clip": 1.11211324, + "balance_loss_mlp": 1.00717926, + "epoch": 0.7210966151626285, + "flos": 66826064240640.0, + "grad_norm": 0.7561537637559609, + "language_loss": 0.51801193, + "learning_rate": 7.618273949296115e-07, + "loss": 0.54448539, + "num_input_tokens_seen": 128987780, + "router_z_loss_clip": 3.390625, + "router_z_loss_mlp": 1.88671875, + "step": 5997, + "time_per_iteration": 3.4342007637023926 + }, + { + "auxiliary_loss_clip": 0.01484007, + "auxiliary_loss_mlp": 0.01255003, + "balance_loss_clip": 1.12664926, + "balance_loss_mlp": 1.0245949, + "epoch": 0.7212168580532676, + "flos": 21143984368320.0, + "grad_norm": 2.316771810434802, + "language_loss": 0.68927473, + "learning_rate": 7.612157445408987e-07, + "loss": 0.71666479, + "num_input_tokens_seen": 129005590, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 2.30273438, + "step": 5998, + "time_per_iteration": 3.0868871212005615 + }, + { + "auxiliary_loss_clip": 0.01492327, + "auxiliary_loss_mlp": 0.01269571, + "balance_loss_clip": 1.13695443, + "balance_loss_mlp": 1.0345856, + "epoch": 0.7213371009439067, + "flos": 22347912045600.0, + "grad_norm": 2.088658917522751, + "language_loss": 0.74461979, + "learning_rate": 7.606042820747716e-07, + "loss": 0.77223873, + "num_input_tokens_seen": 129021995, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.34570312, + "step": 5999, + "time_per_iteration": 4.911109447479248 + }, + { + "auxiliary_loss_clip": 0.01488633, + "auxiliary_loss_mlp": 0.0126742, + "balance_loss_clip": 1.13232803, + "balance_loss_mlp": 1.03682137, + "epoch": 0.7214573438345457, + "flos": 18517926123360.0, + "grad_norm": 1.8503596313549653, + "language_loss": 0.8499707, + "learning_rate": 7.599930076239889e-07, + "loss": 0.87753123, + "num_input_tokens_seen": 129039280, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.30273438, + "step": 6000, + "time_per_iteration": 3.139126777648926 + }, + { + "auxiliary_loss_clip": 0.01491152, + "auxiliary_loss_mlp": 0.01270787, + "balance_loss_clip": 1.13608479, + "balance_loss_mlp": 1.03732717, + "epoch": 0.7215775867251849, + "flos": 35739037776960.0, + "grad_norm": 1.8658209993714117, + "language_loss": 0.70681739, + "learning_rate": 7.593819212812818e-07, + "loss": 0.73443675, + "num_input_tokens_seen": 129060860, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 2.33203125, + "step": 6001, + "time_per_iteration": 3.156470775604248 + }, + { + "auxiliary_loss_clip": 0.01485602, + "auxiliary_loss_mlp": 0.012702, + "balance_loss_clip": 1.13039351, + "balance_loss_mlp": 1.03998303, + "epoch": 0.721697829615824, + "flos": 20374300566720.0, + "grad_norm": 1.7717942785547305, + "language_loss": 0.71846366, + "learning_rate": 7.587710231393508e-07, + "loss": 0.74602175, + "num_input_tokens_seen": 129079215, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.29882812, + "step": 6002, + "time_per_iteration": 3.0288689136505127 + }, + { + "auxiliary_loss_clip": 0.01485681, + "auxiliary_loss_mlp": 0.01255824, + "balance_loss_clip": 1.13004899, + "balance_loss_mlp": 1.02560735, + "epoch": 0.721818072506463, + "flos": 20231878868640.0, + "grad_norm": 2.319917941852383, + "language_loss": 0.83443397, + "learning_rate": 7.581603132908685e-07, + "loss": 0.86184907, + "num_input_tokens_seen": 129097185, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.30078125, + "step": 6003, + "time_per_iteration": 3.0597450733184814 + }, + { + "auxiliary_loss_clip": 0.01485779, + "auxiliary_loss_mlp": 0.01268961, + "balance_loss_clip": 1.13049126, + "balance_loss_mlp": 1.03664601, + "epoch": 0.7219383153971022, + "flos": 18188858128320.0, + "grad_norm": 2.094739801774641, + "language_loss": 0.78660965, + "learning_rate": 7.575497918284795e-07, + "loss": 0.81415707, + "num_input_tokens_seen": 129114730, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 2.31835938, + "step": 6004, + "time_per_iteration": 3.0291783809661865 + }, + { + "auxiliary_loss_clip": 0.01484619, + "auxiliary_loss_mlp": 0.01273125, + "balance_loss_clip": 1.12867081, + "balance_loss_mlp": 1.0394752, + "epoch": 0.7220585582877412, + "flos": 17343734487840.0, + "grad_norm": 3.4995847742724377, + "language_loss": 0.74879014, + "learning_rate": 7.569394588447984e-07, + "loss": 0.7763676, + "num_input_tokens_seen": 129131745, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.33203125, + "step": 6005, + "time_per_iteration": 2.911322593688965 + }, + { + "auxiliary_loss_clip": 0.01484039, + "auxiliary_loss_mlp": 0.0125313, + "balance_loss_clip": 1.12855363, + "balance_loss_mlp": 1.02424777, + "epoch": 0.7221788011783803, + "flos": 16977875813280.0, + "grad_norm": 3.32170558330541, + "language_loss": 0.78415501, + "learning_rate": 7.563293144324146e-07, + "loss": 0.81152672, + "num_input_tokens_seen": 129147295, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.28710938, + "step": 6006, + "time_per_iteration": 2.992987632751465 + }, + { + "auxiliary_loss_clip": 0.01486875, + "auxiliary_loss_mlp": 0.01255803, + "balance_loss_clip": 1.13131881, + "balance_loss_mlp": 1.02882802, + "epoch": 0.7222990440690195, + "flos": 26288763072480.0, + "grad_norm": 2.6146444728099807, + "language_loss": 0.80462623, + "learning_rate": 7.557193586838834e-07, + "loss": 0.83205301, + "num_input_tokens_seen": 129162660, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.27148438, + "step": 6007, + "time_per_iteration": 3.000959634780884 + }, + { + "auxiliary_loss_clip": 0.01482478, + "auxiliary_loss_mlp": 0.01268107, + "balance_loss_clip": 1.12614822, + "balance_loss_mlp": 1.0363636, + "epoch": 0.7224192869596585, + "flos": 17603544934080.0, + "grad_norm": 2.3756267920557894, + "language_loss": 0.70924795, + "learning_rate": 7.551095916917371e-07, + "loss": 0.73675382, + "num_input_tokens_seen": 129179990, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.3125, + "step": 6008, + "time_per_iteration": 2.9896390438079834 + }, + { + "auxiliary_loss_clip": 0.01486535, + "auxiliary_loss_mlp": 0.012721, + "balance_loss_clip": 1.12964678, + "balance_loss_mlp": 1.03482556, + "epoch": 0.7225395298502976, + "flos": 12933972882720.0, + "grad_norm": 3.5487434842180763, + "language_loss": 0.66347253, + "learning_rate": 7.545000135484758e-07, + "loss": 0.69105887, + "num_input_tokens_seen": 129197425, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.3671875, + "step": 6009, + "time_per_iteration": 3.0053293704986572 + }, + { + "auxiliary_loss_clip": 0.01486011, + "auxiliary_loss_mlp": 0.01269809, + "balance_loss_clip": 1.13020003, + "balance_loss_mlp": 1.03768492, + "epoch": 0.7226597727409367, + "flos": 29646538666560.0, + "grad_norm": 1.9050201625468608, + "language_loss": 0.63210815, + "learning_rate": 7.538906243465714e-07, + "loss": 0.6596663, + "num_input_tokens_seen": 129217560, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.31445312, + "step": 6010, + "time_per_iteration": 3.9077529907226562 + }, + { + "auxiliary_loss_clip": 0.01488703, + "auxiliary_loss_mlp": 0.01269495, + "balance_loss_clip": 1.13225365, + "balance_loss_mlp": 1.03546333, + "epoch": 0.7227800156315758, + "flos": 13773331442880.0, + "grad_norm": 2.7665252411140764, + "language_loss": 0.78642058, + "learning_rate": 7.5328142417847e-07, + "loss": 0.81400257, + "num_input_tokens_seen": 129234325, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.33789062, + "step": 6011, + "time_per_iteration": 3.032141923904419 + }, + { + "auxiliary_loss_clip": 0.01481798, + "auxiliary_loss_mlp": 0.01267723, + "balance_loss_clip": 1.12479687, + "balance_loss_mlp": 1.03712463, + "epoch": 0.7229002585222148, + "flos": 20303905173120.0, + "grad_norm": 1.701415563180235, + "language_loss": 0.69230419, + "learning_rate": 7.526724131365838e-07, + "loss": 0.7197994, + "num_input_tokens_seen": 129255280, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.30078125, + "step": 6012, + "time_per_iteration": 3.01646089553833 + }, + { + "auxiliary_loss_clip": 0.01492641, + "auxiliary_loss_mlp": 0.01269878, + "balance_loss_clip": 1.13603783, + "balance_loss_mlp": 1.03298533, + "epoch": 0.723020501412854, + "flos": 16583115516480.0, + "grad_norm": 1.915311764078498, + "language_loss": 0.70704997, + "learning_rate": 7.520635913133017e-07, + "loss": 0.73467517, + "num_input_tokens_seen": 129273910, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.36328125, + "step": 6013, + "time_per_iteration": 2.9374165534973145 + }, + { + "auxiliary_loss_clip": 0.01484929, + "auxiliary_loss_mlp": 0.01277349, + "balance_loss_clip": 1.12827611, + "balance_loss_mlp": 1.040838, + "epoch": 0.7231407443034931, + "flos": 28550821122720.0, + "grad_norm": 1.8863486403105572, + "language_loss": 0.82343858, + "learning_rate": 7.514549588009798e-07, + "loss": 0.85106134, + "num_input_tokens_seen": 129294785, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.359375, + "step": 6014, + "time_per_iteration": 3.052133560180664 + }, + { + "auxiliary_loss_clip": 0.01485792, + "auxiliary_loss_mlp": 0.012646, + "balance_loss_clip": 1.12986803, + "balance_loss_mlp": 1.03495479, + "epoch": 0.7232609871941321, + "flos": 30011183640000.0, + "grad_norm": 3.394703545515408, + "language_loss": 0.71188009, + "learning_rate": 7.508465156919492e-07, + "loss": 0.73938394, + "num_input_tokens_seen": 129318295, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.29296875, + "step": 6015, + "time_per_iteration": 3.0695130825042725 + }, + { + "auxiliary_loss_clip": 0.01494703, + "auxiliary_loss_mlp": 0.01270271, + "balance_loss_clip": 1.13876653, + "balance_loss_mlp": 1.03623927, + "epoch": 0.7233812300847713, + "flos": 16655862456000.0, + "grad_norm": 3.2405509976268734, + "language_loss": 0.61331105, + "learning_rate": 7.502382620785083e-07, + "loss": 0.64096081, + "num_input_tokens_seen": 129334845, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.33398438, + "step": 6016, + "time_per_iteration": 3.0259270668029785 + }, + { + "auxiliary_loss_clip": 0.01451329, + "auxiliary_loss_mlp": 0.0119706, + "balance_loss_clip": 1.11384153, + "balance_loss_mlp": 1.00937653, + "epoch": 0.7235014729754103, + "flos": 67265466346080.0, + "grad_norm": 0.8125747445439576, + "language_loss": 0.62499166, + "learning_rate": 7.496301980529289e-07, + "loss": 0.65147555, + "num_input_tokens_seen": 129398055, + "router_z_loss_clip": 3.375, + "router_z_loss_mlp": 1.875, + "step": 6017, + "time_per_iteration": 3.4746692180633545 + }, + { + "auxiliary_loss_clip": 0.01492362, + "auxiliary_loss_mlp": 0.01260965, + "balance_loss_clip": 1.13747191, + "balance_loss_mlp": 1.02979398, + "epoch": 0.7236217158660494, + "flos": 26945685361440.0, + "grad_norm": 2.2879112102486756, + "language_loss": 0.74693745, + "learning_rate": 7.490223237074547e-07, + "loss": 0.77447069, + "num_input_tokens_seen": 129417765, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.30859375, + "step": 6018, + "time_per_iteration": 3.0075008869171143 + }, + { + "auxiliary_loss_clip": 0.0149019, + "auxiliary_loss_mlp": 0.01265489, + "balance_loss_clip": 1.13608718, + "balance_loss_mlp": 1.03031313, + "epoch": 0.7237419587566886, + "flos": 29425908373920.0, + "grad_norm": 1.9957074128246088, + "language_loss": 0.66306818, + "learning_rate": 7.484146391342989e-07, + "loss": 0.69062501, + "num_input_tokens_seen": 129437560, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 2.34570312, + "step": 6019, + "time_per_iteration": 3.0874598026275635 + }, + { + "auxiliary_loss_clip": 0.01492543, + "auxiliary_loss_mlp": 0.01271394, + "balance_loss_clip": 1.13730311, + "balance_loss_mlp": 1.04174888, + "epoch": 0.7238622016473276, + "flos": 17823416663520.0, + "grad_norm": 2.0605859018276185, + "language_loss": 0.57278419, + "learning_rate": 7.478071444256484e-07, + "loss": 0.60042351, + "num_input_tokens_seen": 129455320, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.29296875, + "step": 6020, + "time_per_iteration": 3.0748298168182373 + }, + { + "auxiliary_loss_clip": 0.0149501, + "auxiliary_loss_mlp": 0.01261534, + "balance_loss_clip": 1.14130306, + "balance_loss_mlp": 1.02864683, + "epoch": 0.7239824445379667, + "flos": 25741492187040.0, + "grad_norm": 2.9605531500049795, + "language_loss": 0.79470843, + "learning_rate": 7.471998396736579e-07, + "loss": 0.82227385, + "num_input_tokens_seen": 129475700, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 2.32421875, + "step": 6021, + "time_per_iteration": 3.019472122192383 + }, + { + "auxiliary_loss_clip": 0.01490809, + "auxiliary_loss_mlp": 0.01273961, + "balance_loss_clip": 1.13566804, + "balance_loss_mlp": 1.04164553, + "epoch": 0.7241026874286057, + "flos": 23151162633120.0, + "grad_norm": 3.873916788650757, + "language_loss": 0.76303113, + "learning_rate": 7.465927249704549e-07, + "loss": 0.79067886, + "num_input_tokens_seen": 129493585, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 2.31835938, + "step": 6022, + "time_per_iteration": 3.2460737228393555 + }, + { + "auxiliary_loss_clip": 0.01487979, + "auxiliary_loss_mlp": 0.01263778, + "balance_loss_clip": 1.13316429, + "balance_loss_mlp": 1.03279757, + "epoch": 0.7242229303192449, + "flos": 20269010901600.0, + "grad_norm": 2.5611437221183815, + "language_loss": 0.77668685, + "learning_rate": 7.459858004081398e-07, + "loss": 0.80420434, + "num_input_tokens_seen": 129511555, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 2.3046875, + "step": 6023, + "time_per_iteration": 3.9037795066833496 + }, + { + "auxiliary_loss_clip": 0.01451791, + "auxiliary_loss_mlp": 0.01193207, + "balance_loss_clip": 1.11409879, + "balance_loss_mlp": 1.00285339, + "epoch": 0.724343173209884, + "flos": 62318902027680.0, + "grad_norm": 0.6582793154776828, + "language_loss": 0.57965958, + "learning_rate": 7.453790660787815e-07, + "loss": 0.60610956, + "num_input_tokens_seen": 129579650, + "router_z_loss_clip": 3.375, + "router_z_loss_mlp": 1.90234375, + "step": 6024, + "time_per_iteration": 3.5476226806640625 + }, + { + "auxiliary_loss_clip": 0.01490069, + "auxiliary_loss_mlp": 0.01264983, + "balance_loss_clip": 1.13307858, + "balance_loss_mlp": 1.02999806, + "epoch": 0.724463416100523, + "flos": 35009292692160.0, + "grad_norm": 2.399095695803683, + "language_loss": 0.6377396, + "learning_rate": 7.447725220744214e-07, + "loss": 0.66529012, + "num_input_tokens_seen": 129601895, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.34765625, + "step": 6025, + "time_per_iteration": 3.0697803497314453 + }, + { + "auxiliary_loss_clip": 0.01487976, + "auxiliary_loss_mlp": 0.01267066, + "balance_loss_clip": 1.13190722, + "balance_loss_mlp": 1.03551412, + "epoch": 0.7245836589911622, + "flos": 21874146662880.0, + "grad_norm": 2.184381985044751, + "language_loss": 0.77400571, + "learning_rate": 7.441661684870717e-07, + "loss": 0.80155611, + "num_input_tokens_seen": 129622150, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.31054688, + "step": 6026, + "time_per_iteration": 3.0125110149383545 + }, + { + "auxiliary_loss_clip": 0.01489583, + "auxiliary_loss_mlp": 0.012618, + "balance_loss_clip": 1.13405466, + "balance_loss_mlp": 1.03005755, + "epoch": 0.7247039018818012, + "flos": 23008930575840.0, + "grad_norm": 2.035395598667386, + "language_loss": 0.81822073, + "learning_rate": 7.435600054087152e-07, + "loss": 0.8457346, + "num_input_tokens_seen": 129644315, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.31445312, + "step": 6027, + "time_per_iteration": 4.717727184295654 + }, + { + "auxiliary_loss_clip": 0.01496959, + "auxiliary_loss_mlp": 0.01257089, + "balance_loss_clip": 1.14254665, + "balance_loss_mlp": 1.02496457, + "epoch": 0.7248241447724403, + "flos": 31725288097920.0, + "grad_norm": 3.436224564622526, + "language_loss": 0.74524766, + "learning_rate": 7.42954032931308e-07, + "loss": 0.77278811, + "num_input_tokens_seen": 129665355, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 2.31640625, + "step": 6028, + "time_per_iteration": 3.040637493133545 + }, + { + "auxiliary_loss_clip": 0.0148693, + "auxiliary_loss_mlp": 0.01264973, + "balance_loss_clip": 1.13214016, + "balance_loss_mlp": 1.03017807, + "epoch": 0.7249443876630794, + "flos": 34899868857600.0, + "grad_norm": 2.0571417945064496, + "language_loss": 0.74858868, + "learning_rate": 7.423482511467733e-07, + "loss": 0.77610767, + "num_input_tokens_seen": 129686125, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.34179688, + "step": 6029, + "time_per_iteration": 3.14241623878479 + }, + { + "auxiliary_loss_clip": 0.01496845, + "auxiliary_loss_mlp": 0.01267317, + "balance_loss_clip": 1.14223409, + "balance_loss_mlp": 1.03614616, + "epoch": 0.7250646305537185, + "flos": 26361699652800.0, + "grad_norm": 2.5904790993999294, + "language_loss": 0.65297389, + "learning_rate": 7.417426601470099e-07, + "loss": 0.68061554, + "num_input_tokens_seen": 129706485, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.30859375, + "step": 6030, + "time_per_iteration": 2.9834344387054443 + }, + { + "auxiliary_loss_clip": 0.01495593, + "auxiliary_loss_mlp": 0.01269799, + "balance_loss_clip": 1.14044631, + "balance_loss_mlp": 1.03538585, + "epoch": 0.7251848734443576, + "flos": 30084271932960.0, + "grad_norm": 3.2670015458432182, + "language_loss": 0.78710616, + "learning_rate": 7.411372600238841e-07, + "loss": 0.81476009, + "num_input_tokens_seen": 129727100, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.33984375, + "step": 6031, + "time_per_iteration": 3.034507989883423 + }, + { + "auxiliary_loss_clip": 0.01490628, + "auxiliary_loss_mlp": 0.0126107, + "balance_loss_clip": 1.13564885, + "balance_loss_mlp": 1.02818274, + "epoch": 0.7253051163349967, + "flos": 17787194906400.0, + "grad_norm": 2.4229300876047484, + "language_loss": 0.74159586, + "learning_rate": 7.405320508692346e-07, + "loss": 0.76911288, + "num_input_tokens_seen": 129745840, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.32421875, + "step": 6032, + "time_per_iteration": 2.9951529502868652 + }, + { + "auxiliary_loss_clip": 0.01490278, + "auxiliary_loss_mlp": 0.01265616, + "balance_loss_clip": 1.13514602, + "balance_loss_mlp": 1.03482711, + "epoch": 0.7254253592256358, + "flos": 12643023052800.0, + "grad_norm": 1.9085614809747093, + "language_loss": 0.75606257, + "learning_rate": 7.399270327748727e-07, + "loss": 0.78362155, + "num_input_tokens_seen": 129763500, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.3046875, + "step": 6033, + "time_per_iteration": 2.960710287094116 + }, + { + "auxiliary_loss_clip": 0.01491073, + "auxiliary_loss_mlp": 0.01263898, + "balance_loss_clip": 1.1362052, + "balance_loss_mlp": 1.03387141, + "epoch": 0.7255456021162748, + "flos": 27201702991680.0, + "grad_norm": 2.884864236280552, + "language_loss": 0.745471, + "learning_rate": 7.39322205832577e-07, + "loss": 0.77302068, + "num_input_tokens_seen": 129784390, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 2.29882812, + "step": 6034, + "time_per_iteration": 3.08919620513916 + }, + { + "auxiliary_loss_clip": 0.01490683, + "auxiliary_loss_mlp": 0.01267766, + "balance_loss_clip": 1.13615859, + "balance_loss_mlp": 1.03430676, + "epoch": 0.725665845006914, + "flos": 21290350595040.0, + "grad_norm": 2.3345238361875102, + "language_loss": 0.80705428, + "learning_rate": 7.387175701341009e-07, + "loss": 0.83463883, + "num_input_tokens_seen": 129803060, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.33007812, + "step": 6035, + "time_per_iteration": 3.0299127101898193 + }, + { + "auxiliary_loss_clip": 0.01488842, + "auxiliary_loss_mlp": 0.01270162, + "balance_loss_clip": 1.13283348, + "balance_loss_mlp": 1.03841889, + "epoch": 0.7257860878975531, + "flos": 16035503277600.0, + "grad_norm": 9.808363421642294, + "language_loss": 0.72226334, + "learning_rate": 7.381131257711659e-07, + "loss": 0.74985343, + "num_input_tokens_seen": 129820165, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.3125, + "step": 6036, + "time_per_iteration": 3.790951728820801 + }, + { + "auxiliary_loss_clip": 0.01490218, + "auxiliary_loss_mlp": 0.01268185, + "balance_loss_clip": 1.1360178, + "balance_loss_mlp": 1.03548837, + "epoch": 0.7259063307881921, + "flos": 12131556714720.0, + "grad_norm": 1.9505167041007643, + "language_loss": 0.83786839, + "learning_rate": 7.375088728354677e-07, + "loss": 0.86545241, + "num_input_tokens_seen": 129835195, + "router_z_loss_clip": 3.54101562, + "router_z_loss_mlp": 2.32226562, + "step": 6037, + "time_per_iteration": 3.0776100158691406 + }, + { + "auxiliary_loss_clip": 0.01492406, + "auxiliary_loss_mlp": 0.01265775, + "balance_loss_clip": 1.13730931, + "balance_loss_mlp": 1.03193402, + "epoch": 0.7260265736788313, + "flos": 30446375719680.0, + "grad_norm": 1.463982320393211, + "language_loss": 0.67557496, + "learning_rate": 7.369048114186691e-07, + "loss": 0.70315677, + "num_input_tokens_seen": 129856240, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.33203125, + "step": 6038, + "time_per_iteration": 3.1055750846862793 + }, + { + "auxiliary_loss_clip": 0.01496801, + "auxiliary_loss_mlp": 0.0125019, + "balance_loss_clip": 1.14270675, + "balance_loss_mlp": 1.02073622, + "epoch": 0.7261468165694703, + "flos": 21144401578080.0, + "grad_norm": 1.7560010782629405, + "language_loss": 0.83213383, + "learning_rate": 7.363009416124055e-07, + "loss": 0.85960376, + "num_input_tokens_seen": 129875565, + "router_z_loss_clip": 3.54101562, + "router_z_loss_mlp": 2.29296875, + "step": 6039, + "time_per_iteration": 3.0973782539367676 + }, + { + "auxiliary_loss_clip": 0.01489539, + "auxiliary_loss_mlp": 0.01261395, + "balance_loss_clip": 1.13381338, + "balance_loss_mlp": 1.02850795, + "epoch": 0.7262670594601094, + "flos": 22308314682240.0, + "grad_norm": 2.256264432642336, + "language_loss": 0.62620878, + "learning_rate": 7.356972635082852e-07, + "loss": 0.65371811, + "num_input_tokens_seen": 129894420, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.32617188, + "step": 6040, + "time_per_iteration": 3.0265984535217285 + }, + { + "auxiliary_loss_clip": 0.01486858, + "auxiliary_loss_mlp": 0.01263613, + "balance_loss_clip": 1.13057351, + "balance_loss_mlp": 1.02939034, + "epoch": 0.7263873023507486, + "flos": 25337249850240.0, + "grad_norm": 1.7324578520365241, + "language_loss": 0.75755453, + "learning_rate": 7.35093777197884e-07, + "loss": 0.78505921, + "num_input_tokens_seen": 129914490, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.3359375, + "step": 6041, + "time_per_iteration": 3.0354273319244385 + }, + { + "auxiliary_loss_clip": 0.01488729, + "auxiliary_loss_mlp": 0.0126116, + "balance_loss_clip": 1.13309157, + "balance_loss_mlp": 1.03075218, + "epoch": 0.7265075452413876, + "flos": 23880907717920.0, + "grad_norm": 2.350168138425173, + "language_loss": 0.8595233, + "learning_rate": 7.344904827727525e-07, + "loss": 0.88702226, + "num_input_tokens_seen": 129931670, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.30078125, + "step": 6042, + "time_per_iteration": 3.062136650085449 + }, + { + "auxiliary_loss_clip": 0.01491201, + "auxiliary_loss_mlp": 0.01266328, + "balance_loss_clip": 1.13604546, + "balance_loss_mlp": 1.0353477, + "epoch": 0.7266277881320267, + "flos": 28726733750400.0, + "grad_norm": 10.41070708513587, + "language_loss": 0.73215991, + "learning_rate": 7.338873803244076e-07, + "loss": 0.75973523, + "num_input_tokens_seen": 129946905, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.30859375, + "step": 6043, + "time_per_iteration": 3.0633060932159424 + }, + { + "auxiliary_loss_clip": 0.01483416, + "auxiliary_loss_mlp": 0.01261859, + "balance_loss_clip": 1.12669611, + "balance_loss_mlp": 1.03049779, + "epoch": 0.7267480310226658, + "flos": 24865836013440.0, + "grad_norm": 1.8668235154256207, + "language_loss": 0.80711663, + "learning_rate": 7.332844699443401e-07, + "loss": 0.83456933, + "num_input_tokens_seen": 129965505, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.30664062, + "step": 6044, + "time_per_iteration": 3.0168755054473877 + }, + { + "auxiliary_loss_clip": 0.01487104, + "auxiliary_loss_mlp": 0.01259356, + "balance_loss_clip": 1.13039231, + "balance_loss_mlp": 1.02856684, + "epoch": 0.7268682739133049, + "flos": 27200944428480.0, + "grad_norm": 1.8604720585862213, + "language_loss": 0.75663501, + "learning_rate": 7.326817517240121e-07, + "loss": 0.78409958, + "num_input_tokens_seen": 129987210, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.3046875, + "step": 6045, + "time_per_iteration": 3.1398751735687256 + }, + { + "auxiliary_loss_clip": 0.01485029, + "auxiliary_loss_mlp": 0.01256345, + "balance_loss_clip": 1.12983489, + "balance_loss_mlp": 1.02689052, + "epoch": 0.7269885168039439, + "flos": 33510812009760.0, + "grad_norm": 2.4931008409259183, + "language_loss": 0.83565426, + "learning_rate": 7.320792257548545e-07, + "loss": 0.86306798, + "num_input_tokens_seen": 130008385, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.29101562, + "step": 6046, + "time_per_iteration": 3.0820062160491943 + }, + { + "auxiliary_loss_clip": 0.01489074, + "auxiliary_loss_mlp": 0.01268735, + "balance_loss_clip": 1.13314867, + "balance_loss_mlp": 1.03661108, + "epoch": 0.7271087596945831, + "flos": 24315910156800.0, + "grad_norm": 1.9855525250651638, + "language_loss": 0.76183796, + "learning_rate": 7.314768921282704e-07, + "loss": 0.78941607, + "num_input_tokens_seen": 130029040, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.31640625, + "step": 6047, + "time_per_iteration": 3.000620126724243 + }, + { + "auxiliary_loss_clip": 0.0148733, + "auxiliary_loss_mlp": 0.01268868, + "balance_loss_clip": 1.13277316, + "balance_loss_mlp": 1.03960466, + "epoch": 0.7272290025852222, + "flos": 23807402215200.0, + "grad_norm": 2.9415642333977114, + "language_loss": 0.72158748, + "learning_rate": 7.30874750935633e-07, + "loss": 0.74914944, + "num_input_tokens_seen": 130048725, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 2.2890625, + "step": 6048, + "time_per_iteration": 3.048642158508301 + }, + { + "auxiliary_loss_clip": 0.01485802, + "auxiliary_loss_mlp": 0.01258326, + "balance_loss_clip": 1.13032854, + "balance_loss_mlp": 1.02658308, + "epoch": 0.7273492454758612, + "flos": 16721858183040.0, + "grad_norm": 2.220055756062777, + "language_loss": 0.79555005, + "learning_rate": 7.30272802268286e-07, + "loss": 0.82299137, + "num_input_tokens_seen": 130065720, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.3125, + "step": 6049, + "time_per_iteration": 3.1012887954711914 + }, + { + "auxiliary_loss_clip": 0.0148804, + "auxiliary_loss_mlp": 0.01249943, + "balance_loss_clip": 1.13277125, + "balance_loss_mlp": 1.02544785, + "epoch": 0.7274694883665004, + "flos": 28033475919840.0, + "grad_norm": 1.7071350360652733, + "language_loss": 0.76407808, + "learning_rate": 7.29671046217547e-07, + "loss": 0.79145789, + "num_input_tokens_seen": 130084830, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.24804688, + "step": 6050, + "time_per_iteration": 3.9223670959472656 + }, + { + "auxiliary_loss_clip": 0.01483354, + "auxiliary_loss_mlp": 0.01254265, + "balance_loss_clip": 1.12799919, + "balance_loss_mlp": 1.0259552, + "epoch": 0.7275897312571394, + "flos": 30375335547360.0, + "grad_norm": 1.808041983237513, + "language_loss": 0.81891692, + "learning_rate": 7.290694828746988e-07, + "loss": 0.84629315, + "num_input_tokens_seen": 130104495, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.28320312, + "step": 6051, + "time_per_iteration": 3.045325517654419 + }, + { + "auxiliary_loss_clip": 0.01489566, + "auxiliary_loss_mlp": 0.01256019, + "balance_loss_clip": 1.13457656, + "balance_loss_mlp": 1.02522969, + "epoch": 0.7277099741477785, + "flos": 19206594646560.0, + "grad_norm": 1.9729059631881243, + "language_loss": 0.8531633, + "learning_rate": 7.284681123310004e-07, + "loss": 0.88061917, + "num_input_tokens_seen": 130123210, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 2.3046875, + "step": 6052, + "time_per_iteration": 2.999241828918457 + }, + { + "auxiliary_loss_clip": 0.0148684, + "auxiliary_loss_mlp": 0.01268593, + "balance_loss_clip": 1.13140726, + "balance_loss_mlp": 1.0364691, + "epoch": 0.7278302170384175, + "flos": 20669991416640.0, + "grad_norm": 4.069334750165475, + "language_loss": 0.79629052, + "learning_rate": 7.27866934677678e-07, + "loss": 0.82384479, + "num_input_tokens_seen": 130142880, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.31835938, + "step": 6053, + "time_per_iteration": 3.048025131225586 + }, + { + "auxiliary_loss_clip": 0.01483421, + "auxiliary_loss_mlp": 0.01261999, + "balance_loss_clip": 1.1282835, + "balance_loss_mlp": 1.03101885, + "epoch": 0.7279504599290567, + "flos": 19094895122400.0, + "grad_norm": 2.2219042728184606, + "language_loss": 0.78170127, + "learning_rate": 7.272659500059297e-07, + "loss": 0.80915546, + "num_input_tokens_seen": 130160220, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.30664062, + "step": 6054, + "time_per_iteration": 3.9961986541748047 + }, + { + "auxiliary_loss_clip": 0.01490699, + "auxiliary_loss_mlp": 0.01268787, + "balance_loss_clip": 1.13495755, + "balance_loss_mlp": 1.03742599, + "epoch": 0.7280707028196958, + "flos": 19064172948480.0, + "grad_norm": 2.103129680357426, + "language_loss": 0.80239171, + "learning_rate": 7.266651584069264e-07, + "loss": 0.82998657, + "num_input_tokens_seen": 130177885, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.30664062, + "step": 6055, + "time_per_iteration": 3.020285129547119 + }, + { + "auxiliary_loss_clip": 0.01489043, + "auxiliary_loss_mlp": 0.01262079, + "balance_loss_clip": 1.13354731, + "balance_loss_mlp": 1.03090835, + "epoch": 0.7281909457103348, + "flos": 37199362366080.0, + "grad_norm": 1.7683193940828728, + "language_loss": 0.57260895, + "learning_rate": 7.260645599718045e-07, + "loss": 0.60012019, + "num_input_tokens_seen": 130204240, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.30859375, + "step": 6056, + "time_per_iteration": 3.3003196716308594 + }, + { + "auxiliary_loss_clip": 0.01489414, + "auxiliary_loss_mlp": 0.01273636, + "balance_loss_clip": 1.13409758, + "balance_loss_mlp": 1.03922272, + "epoch": 0.728311188600974, + "flos": 20669156997120.0, + "grad_norm": 6.484569006112854, + "language_loss": 0.67471802, + "learning_rate": 7.254641547916767e-07, + "loss": 0.70234859, + "num_input_tokens_seen": 130221735, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.34179688, + "step": 6057, + "time_per_iteration": 3.1286468505859375 + }, + { + "auxiliary_loss_clip": 0.01486529, + "auxiliary_loss_mlp": 0.01258918, + "balance_loss_clip": 1.13048172, + "balance_loss_mlp": 1.02622116, + "epoch": 0.728431431491613, + "flos": 28843250150880.0, + "grad_norm": 2.018083715267473, + "language_loss": 0.68818855, + "learning_rate": 7.248639429576226e-07, + "loss": 0.71564305, + "num_input_tokens_seen": 130241190, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.32226562, + "step": 6058, + "time_per_iteration": 3.250401496887207 + }, + { + "auxiliary_loss_clip": 0.01487888, + "auxiliary_loss_mlp": 0.01260269, + "balance_loss_clip": 1.13154912, + "balance_loss_mlp": 1.03119624, + "epoch": 0.7285516743822521, + "flos": 25994096282880.0, + "grad_norm": 1.8048904914674124, + "language_loss": 0.72249067, + "learning_rate": 7.242639245606959e-07, + "loss": 0.74997222, + "num_input_tokens_seen": 130260980, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.28710938, + "step": 6059, + "time_per_iteration": 3.1049394607543945 + }, + { + "auxiliary_loss_clip": 0.01486877, + "auxiliary_loss_mlp": 0.0126709, + "balance_loss_clip": 1.13076305, + "balance_loss_mlp": 1.03420258, + "epoch": 0.7286719172728913, + "flos": 16401627449280.0, + "grad_norm": 1.8001774598670903, + "language_loss": 0.82592386, + "learning_rate": 7.236640996919168e-07, + "loss": 0.85346353, + "num_input_tokens_seen": 130280025, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.32226562, + "step": 6060, + "time_per_iteration": 2.9841854572296143 + }, + { + "auxiliary_loss_clip": 0.0148539, + "auxiliary_loss_mlp": 0.01264914, + "balance_loss_clip": 1.13046443, + "balance_loss_mlp": 1.03259897, + "epoch": 0.7287921601635303, + "flos": 22020361176960.0, + "grad_norm": 1.6262213416582105, + "language_loss": 0.70395923, + "learning_rate": 7.230644684422782e-07, + "loss": 0.73146224, + "num_input_tokens_seen": 130300255, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.31640625, + "step": 6061, + "time_per_iteration": 3.0393667221069336 + }, + { + "auxiliary_loss_clip": 0.01482552, + "auxiliary_loss_mlp": 0.01256233, + "balance_loss_clip": 1.1266284, + "balance_loss_mlp": 1.02544367, + "epoch": 0.7289124030541694, + "flos": 24602839601760.0, + "grad_norm": 1.907387807136403, + "language_loss": 0.81920886, + "learning_rate": 7.224650309027451e-07, + "loss": 0.84659666, + "num_input_tokens_seen": 130320005, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.30273438, + "step": 6062, + "time_per_iteration": 3.063716411590576 + }, + { + "auxiliary_loss_clip": 0.0148883, + "auxiliary_loss_mlp": 0.01263935, + "balance_loss_clip": 1.1333077, + "balance_loss_mlp": 1.03257418, + "epoch": 0.7290326459448085, + "flos": 21395564403840.0, + "grad_norm": 1.6936615927346887, + "language_loss": 0.6873157, + "learning_rate": 7.218657871642506e-07, + "loss": 0.71484327, + "num_input_tokens_seen": 130338810, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.3125, + "step": 6063, + "time_per_iteration": 3.1330227851867676 + }, + { + "auxiliary_loss_clip": 0.0149355, + "auxiliary_loss_mlp": 0.01273988, + "balance_loss_clip": 1.13799405, + "balance_loss_mlp": 1.041291, + "epoch": 0.7291528888354476, + "flos": 18589649002560.0, + "grad_norm": 2.1389943677056564, + "language_loss": 0.62364066, + "learning_rate": 7.212667373177012e-07, + "loss": 0.65131605, + "num_input_tokens_seen": 130353805, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.32226562, + "step": 6064, + "time_per_iteration": 3.8732798099517822 + }, + { + "auxiliary_loss_clip": 0.01483865, + "auxiliary_loss_mlp": 0.01262744, + "balance_loss_clip": 1.12727368, + "balance_loss_mlp": 1.03214574, + "epoch": 0.7292731317260867, + "flos": 18952966490400.0, + "grad_norm": 1.8896808582116895, + "language_loss": 0.75249243, + "learning_rate": 7.206678814539704e-07, + "loss": 0.77995849, + "num_input_tokens_seen": 130372105, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.30273438, + "step": 6065, + "time_per_iteration": 2.9711878299713135 + }, + { + "auxiliary_loss_clip": 0.01490956, + "auxiliary_loss_mlp": 0.0126114, + "balance_loss_clip": 1.13574207, + "balance_loss_mlp": 1.0326401, + "epoch": 0.7293933746167258, + "flos": 21069606517920.0, + "grad_norm": 1.6383668868874939, + "language_loss": 0.72632068, + "learning_rate": 7.20069219663904e-07, + "loss": 0.75384164, + "num_input_tokens_seen": 130391990, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.28320312, + "step": 6066, + "time_per_iteration": 3.0549681186676025 + }, + { + "auxiliary_loss_clip": 0.01483158, + "auxiliary_loss_mlp": 0.01271585, + "balance_loss_clip": 1.12692571, + "balance_loss_mlp": 1.03965139, + "epoch": 0.7295136175073649, + "flos": 22455401544000.0, + "grad_norm": 1.7586566762672982, + "language_loss": 0.79425305, + "learning_rate": 7.1947075203832e-07, + "loss": 0.82180053, + "num_input_tokens_seen": 130411970, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.31445312, + "step": 6067, + "time_per_iteration": 3.026108980178833 + }, + { + "auxiliary_loss_clip": 0.01449053, + "auxiliary_loss_mlp": 0.01193069, + "balance_loss_clip": 1.10984159, + "balance_loss_mlp": 1.00424194, + "epoch": 0.7296338603980039, + "flos": 56131278363360.0, + "grad_norm": 0.8877308582987975, + "language_loss": 0.60145986, + "learning_rate": 7.188724786680049e-07, + "loss": 0.62788105, + "num_input_tokens_seen": 130472440, + "router_z_loss_clip": 3.390625, + "router_z_loss_mlp": 1.88671875, + "step": 6068, + "time_per_iteration": 3.4306867122650146 + }, + { + "auxiliary_loss_clip": 0.01481338, + "auxiliary_loss_mlp": 0.01259111, + "balance_loss_clip": 1.12507319, + "balance_loss_mlp": 1.02736819, + "epoch": 0.7297541032886431, + "flos": 25230632699520.0, + "grad_norm": 1.8613617193804683, + "language_loss": 0.75757253, + "learning_rate": 7.182743996437162e-07, + "loss": 0.78497696, + "num_input_tokens_seen": 130491975, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.31445312, + "step": 6069, + "time_per_iteration": 3.0290143489837646 + }, + { + "auxiliary_loss_clip": 0.01488858, + "auxiliary_loss_mlp": 0.0126555, + "balance_loss_clip": 1.13344336, + "balance_loss_mlp": 1.02980185, + "epoch": 0.7298743461792822, + "flos": 26469947714400.0, + "grad_norm": 1.8824399982602698, + "language_loss": 0.68930721, + "learning_rate": 7.176765150561819e-07, + "loss": 0.71685129, + "num_input_tokens_seen": 130510580, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.35351562, + "step": 6070, + "time_per_iteration": 2.995896577835083 + }, + { + "auxiliary_loss_clip": 0.01483945, + "auxiliary_loss_mlp": 0.01258248, + "balance_loss_clip": 1.12874365, + "balance_loss_mlp": 1.02917504, + "epoch": 0.7299945890699212, + "flos": 19570860338400.0, + "grad_norm": 3.2212528425336946, + "language_loss": 0.79843998, + "learning_rate": 7.170788249961002e-07, + "loss": 0.82586193, + "num_input_tokens_seen": 130529090, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.29101562, + "step": 6071, + "time_per_iteration": 2.9030840396881104 + }, + { + "auxiliary_loss_clip": 0.01482053, + "auxiliary_loss_mlp": 0.01260708, + "balance_loss_clip": 1.12639463, + "balance_loss_mlp": 1.03220809, + "epoch": 0.7301148319605604, + "flos": 22931139191040.0, + "grad_norm": 1.9824827606977449, + "language_loss": 0.88189459, + "learning_rate": 7.164813295541418e-07, + "loss": 0.9093222, + "num_input_tokens_seen": 130548655, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.28320312, + "step": 6072, + "time_per_iteration": 3.071972131729126 + }, + { + "auxiliary_loss_clip": 0.01481548, + "auxiliary_loss_mlp": 0.01276123, + "balance_loss_clip": 1.12533426, + "balance_loss_mlp": 1.04285383, + "epoch": 0.7302350748511994, + "flos": 25371802768320.0, + "grad_norm": 1.6161546666272524, + "language_loss": 0.70361519, + "learning_rate": 7.15884028820944e-07, + "loss": 0.73119187, + "num_input_tokens_seen": 130567710, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.32617188, + "step": 6073, + "time_per_iteration": 2.948732376098633 + }, + { + "auxiliary_loss_clip": 0.01481422, + "auxiliary_loss_mlp": 0.01257922, + "balance_loss_clip": 1.12597728, + "balance_loss_mlp": 1.02408147, + "epoch": 0.7303553177418385, + "flos": 27821682888480.0, + "grad_norm": 3.1084259432941903, + "language_loss": 0.60206592, + "learning_rate": 7.152869228871185e-07, + "loss": 0.62945938, + "num_input_tokens_seen": 130590195, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.33203125, + "step": 6074, + "time_per_iteration": 3.0196053981781006 + }, + { + "auxiliary_loss_clip": 0.01490768, + "auxiliary_loss_mlp": 0.01263652, + "balance_loss_clip": 1.13613367, + "balance_loss_mlp": 1.03114629, + "epoch": 0.7304755606324776, + "flos": 24428861310240.0, + "grad_norm": 2.000450713883, + "language_loss": 0.7215451, + "learning_rate": 7.146900118432457e-07, + "loss": 0.74908924, + "num_input_tokens_seen": 130609940, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.31835938, + "step": 6075, + "time_per_iteration": 3.055103063583374 + }, + { + "auxiliary_loss_clip": 0.01483292, + "auxiliary_loss_mlp": 0.0126118, + "balance_loss_clip": 1.12737393, + "balance_loss_mlp": 1.02905583, + "epoch": 0.7305958035231167, + "flos": 23842675768320.0, + "grad_norm": 1.6333313865823258, + "language_loss": 0.85689092, + "learning_rate": 7.140932957798753e-07, + "loss": 0.88433564, + "num_input_tokens_seen": 130628380, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.31640625, + "step": 6076, + "time_per_iteration": 3.0278003215789795 + }, + { + "auxiliary_loss_clip": 0.01491678, + "auxiliary_loss_mlp": 0.01267544, + "balance_loss_clip": 1.13600874, + "balance_loss_mlp": 1.03236771, + "epoch": 0.7307160464137558, + "flos": 16728799036320.0, + "grad_norm": 2.2448489124431523, + "language_loss": 0.71460128, + "learning_rate": 7.134967747875309e-07, + "loss": 0.74219346, + "num_input_tokens_seen": 130646590, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.34765625, + "step": 6077, + "time_per_iteration": 3.9053475856781006 + }, + { + "auxiliary_loss_clip": 0.01488709, + "auxiliary_loss_mlp": 0.01260357, + "balance_loss_clip": 1.13309956, + "balance_loss_mlp": 1.02918625, + "epoch": 0.7308362893043949, + "flos": 21800413591200.0, + "grad_norm": 2.5007182823123313, + "language_loss": 0.81795681, + "learning_rate": 7.129004489567014e-07, + "loss": 0.84544742, + "num_input_tokens_seen": 130664070, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.30859375, + "step": 6078, + "time_per_iteration": 3.0285468101501465 + }, + { + "auxiliary_loss_clip": 0.01484663, + "auxiliary_loss_mlp": 0.01254826, + "balance_loss_clip": 1.128649, + "balance_loss_mlp": 1.02460921, + "epoch": 0.730956532195034, + "flos": 10708933080960.0, + "grad_norm": 2.4239574778831052, + "language_loss": 0.78258789, + "learning_rate": 7.123043183778512e-07, + "loss": 0.80998278, + "num_input_tokens_seen": 130681400, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.296875, + "step": 6079, + "time_per_iteration": 2.9663188457489014 + }, + { + "auxiliary_loss_clip": 0.01484947, + "auxiliary_loss_mlp": 0.01270113, + "balance_loss_clip": 1.12981606, + "balance_loss_mlp": 1.03760695, + "epoch": 0.731076775085673, + "flos": 19794031817760.0, + "grad_norm": 1.563754962723424, + "language_loss": 0.65206379, + "learning_rate": 7.117083831414114e-07, + "loss": 0.67961437, + "num_input_tokens_seen": 130700675, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 2.31835938, + "step": 6080, + "time_per_iteration": 3.1975924968719482 + }, + { + "auxiliary_loss_clip": 0.01480589, + "auxiliary_loss_mlp": 0.01260566, + "balance_loss_clip": 1.12440026, + "balance_loss_mlp": 1.03034902, + "epoch": 0.7311970179763122, + "flos": 20449019770560.0, + "grad_norm": 2.74311062313631, + "language_loss": 0.6994549, + "learning_rate": 7.11112643337787e-07, + "loss": 0.72686636, + "num_input_tokens_seen": 130719720, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.29882812, + "step": 6081, + "time_per_iteration": 3.857058525085449 + }, + { + "auxiliary_loss_clip": 0.01483041, + "auxiliary_loss_mlp": 0.01264945, + "balance_loss_clip": 1.12754118, + "balance_loss_mlp": 1.03339267, + "epoch": 0.7313172608669513, + "flos": 18515726290080.0, + "grad_norm": 2.343518498120171, + "language_loss": 0.77258891, + "learning_rate": 7.10517099057349e-07, + "loss": 0.80006874, + "num_input_tokens_seen": 130736670, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.3125, + "step": 6082, + "time_per_iteration": 3.9347140789031982 + }, + { + "auxiliary_loss_clip": 0.01488705, + "auxiliary_loss_mlp": 0.01271402, + "balance_loss_clip": 1.13292956, + "balance_loss_mlp": 1.03946877, + "epoch": 0.7314375037575903, + "flos": 16182628067520.0, + "grad_norm": 2.74970881798404, + "language_loss": 0.61320859, + "learning_rate": 7.099217503904411e-07, + "loss": 0.64080966, + "num_input_tokens_seen": 130754525, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.31445312, + "step": 6083, + "time_per_iteration": 3.0477499961853027 + }, + { + "auxiliary_loss_clip": 0.0147899, + "auxiliary_loss_mlp": 0.01255818, + "balance_loss_clip": 1.12221503, + "balance_loss_mlp": 1.0236938, + "epoch": 0.7315577466482295, + "flos": 17969896674720.0, + "grad_norm": 1.8941386626393464, + "language_loss": 0.90025657, + "learning_rate": 7.093265974273788e-07, + "loss": 0.92760468, + "num_input_tokens_seen": 130772420, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.31835938, + "step": 6084, + "time_per_iteration": 3.031158447265625 + }, + { + "auxiliary_loss_clip": 0.01481527, + "auxiliary_loss_mlp": 0.01265431, + "balance_loss_clip": 1.12570083, + "balance_loss_mlp": 1.0317806, + "epoch": 0.7316779895388685, + "flos": 18407402372160.0, + "grad_norm": 1.8183377547748418, + "language_loss": 0.71836632, + "learning_rate": 7.087316402584447e-07, + "loss": 0.7458359, + "num_input_tokens_seen": 130791245, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.33203125, + "step": 6085, + "time_per_iteration": 2.9487524032592773 + }, + { + "auxiliary_loss_clip": 0.01485469, + "auxiliary_loss_mlp": 0.0127267, + "balance_loss_clip": 1.12954545, + "balance_loss_mlp": 1.0420711, + "epoch": 0.7317982324295076, + "flos": 17930071742400.0, + "grad_norm": 1.9244510674714892, + "language_loss": 0.86618018, + "learning_rate": 7.081368789738953e-07, + "loss": 0.89376158, + "num_input_tokens_seen": 130808445, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.30273438, + "step": 6086, + "time_per_iteration": 2.943990468978882 + }, + { + "auxiliary_loss_clip": 0.01480807, + "auxiliary_loss_mlp": 0.01255185, + "balance_loss_clip": 1.1246469, + "balance_loss_mlp": 1.02611208, + "epoch": 0.7319184753201466, + "flos": 27232007955840.0, + "grad_norm": 1.9815893978064782, + "language_loss": 0.77271748, + "learning_rate": 7.075423136639537e-07, + "loss": 0.80007744, + "num_input_tokens_seen": 130827700, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.2890625, + "step": 6087, + "time_per_iteration": 3.100909948348999 + }, + { + "auxiliary_loss_clip": 0.01480532, + "auxiliary_loss_mlp": 0.01265461, + "balance_loss_clip": 1.12329423, + "balance_loss_mlp": 1.03314555, + "epoch": 0.7320387182107858, + "flos": 37451435467680.0, + "grad_norm": 2.123228163417504, + "language_loss": 0.74683321, + "learning_rate": 7.069479444188149e-07, + "loss": 0.77429307, + "num_input_tokens_seen": 130848290, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.31835938, + "step": 6088, + "time_per_iteration": 3.2105894088745117 + }, + { + "auxiliary_loss_clip": 0.01483868, + "auxiliary_loss_mlp": 0.01261833, + "balance_loss_clip": 1.1268903, + "balance_loss_mlp": 1.0312351, + "epoch": 0.7321589611014249, + "flos": 17861003834400.0, + "grad_norm": 1.839119828728007, + "language_loss": 0.82245284, + "learning_rate": 7.063537713286453e-07, + "loss": 0.84990984, + "num_input_tokens_seen": 130865970, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 2.30273438, + "step": 6089, + "time_per_iteration": 3.009446382522583 + }, + { + "auxiliary_loss_clip": 0.01486529, + "auxiliary_loss_mlp": 0.01265586, + "balance_loss_clip": 1.13100958, + "balance_loss_mlp": 1.03193545, + "epoch": 0.7322792039920639, + "flos": 26102799482400.0, + "grad_norm": 1.8552227196693072, + "language_loss": 0.8053959, + "learning_rate": 7.057597944835803e-07, + "loss": 0.83291703, + "num_input_tokens_seen": 130885245, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.33203125, + "step": 6090, + "time_per_iteration": 2.9638729095458984 + }, + { + "auxiliary_loss_clip": 0.01482711, + "auxiliary_loss_mlp": 0.01258485, + "balance_loss_clip": 1.12621689, + "balance_loss_mlp": 1.02788663, + "epoch": 0.7323994468827031, + "flos": 25371157989600.0, + "grad_norm": 1.8251935145972582, + "language_loss": 0.74927211, + "learning_rate": 7.051660139737253e-07, + "loss": 0.77668411, + "num_input_tokens_seen": 130903465, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.30273438, + "step": 6091, + "time_per_iteration": 3.083655595779419 + }, + { + "auxiliary_loss_clip": 0.01484577, + "auxiliary_loss_mlp": 0.01266463, + "balance_loss_clip": 1.12894642, + "balance_loss_mlp": 1.03395772, + "epoch": 0.7325196897733421, + "flos": 26909539460640.0, + "grad_norm": 2.4780545654048427, + "language_loss": 0.76567531, + "learning_rate": 7.045724298891565e-07, + "loss": 0.79318565, + "num_input_tokens_seen": 130922935, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.3203125, + "step": 6092, + "time_per_iteration": 3.8548293113708496 + }, + { + "auxiliary_loss_clip": 0.01485527, + "auxiliary_loss_mlp": 0.0126046, + "balance_loss_clip": 1.13136744, + "balance_loss_mlp": 1.02852678, + "epoch": 0.7326399326639812, + "flos": 25778169082080.0, + "grad_norm": 2.05070414159381, + "language_loss": 0.69556493, + "learning_rate": 7.039790423199192e-07, + "loss": 0.72302479, + "num_input_tokens_seen": 130942575, + "router_z_loss_clip": 3.54101562, + "router_z_loss_mlp": 2.31445312, + "step": 6093, + "time_per_iteration": 2.9546074867248535 + }, + { + "auxiliary_loss_clip": 0.01494048, + "auxiliary_loss_mlp": 0.01261532, + "balance_loss_clip": 1.13869357, + "balance_loss_mlp": 1.02788162, + "epoch": 0.7327601755546204, + "flos": 21034370892960.0, + "grad_norm": 2.0507868238777975, + "language_loss": 0.78092498, + "learning_rate": 7.033858513560322e-07, + "loss": 0.80848074, + "num_input_tokens_seen": 130958870, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.33007812, + "step": 6094, + "time_per_iteration": 3.029839277267456 + }, + { + "auxiliary_loss_clip": 0.0148872, + "auxiliary_loss_mlp": 0.01264449, + "balance_loss_clip": 1.13299942, + "balance_loss_mlp": 1.03270614, + "epoch": 0.7328804184452594, + "flos": 16291065769920.0, + "grad_norm": 2.203862413953486, + "language_loss": 0.76235509, + "learning_rate": 7.027928570874794e-07, + "loss": 0.78988683, + "num_input_tokens_seen": 130977060, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.31445312, + "step": 6095, + "time_per_iteration": 3.014571189880371 + }, + { + "auxiliary_loss_clip": 0.01479708, + "auxiliary_loss_mlp": 0.0126343, + "balance_loss_clip": 1.12264514, + "balance_loss_mlp": 1.03187799, + "epoch": 0.7330006613358985, + "flos": 17860169414880.0, + "grad_norm": 2.449105288647986, + "language_loss": 0.85477322, + "learning_rate": 7.022000596042194e-07, + "loss": 0.88220459, + "num_input_tokens_seen": 130994160, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.3125, + "step": 6096, + "time_per_iteration": 3.112884998321533 + }, + { + "auxiliary_loss_clip": 0.014856, + "auxiliary_loss_mlp": 0.01263383, + "balance_loss_clip": 1.13004041, + "balance_loss_mlp": 1.03335643, + "epoch": 0.7331209042265376, + "flos": 22494278272320.0, + "grad_norm": 2.811214810398117, + "language_loss": 0.82020873, + "learning_rate": 7.016074589961784e-07, + "loss": 0.84769857, + "num_input_tokens_seen": 131012725, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.29882812, + "step": 6097, + "time_per_iteration": 3.1150412559509277 + }, + { + "auxiliary_loss_clip": 0.01488015, + "auxiliary_loss_mlp": 0.01256507, + "balance_loss_clip": 1.13135517, + "balance_loss_mlp": 1.02800632, + "epoch": 0.7332411471171767, + "flos": 33075544073760.0, + "grad_norm": 1.8103923054132576, + "language_loss": 0.67001837, + "learning_rate": 7.01015055353253e-07, + "loss": 0.69746357, + "num_input_tokens_seen": 131035150, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.28515625, + "step": 6098, + "time_per_iteration": 3.0805504322052 + }, + { + "auxiliary_loss_clip": 0.01494259, + "auxiliary_loss_mlp": 0.01257852, + "balance_loss_clip": 1.13870096, + "balance_loss_mlp": 1.02553749, + "epoch": 0.7333613900078157, + "flos": 22744985960160.0, + "grad_norm": 5.967236030193812, + "language_loss": 0.77910841, + "learning_rate": 7.004228487653123e-07, + "loss": 0.80662954, + "num_input_tokens_seen": 131055955, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.3203125, + "step": 6099, + "time_per_iteration": 3.2148404121398926 + }, + { + "auxiliary_loss_clip": 0.01482573, + "auxiliary_loss_mlp": 0.0125853, + "balance_loss_clip": 1.12613416, + "balance_loss_mlp": 1.02735972, + "epoch": 0.7334816328984549, + "flos": 22348329255360.0, + "grad_norm": 2.145066293120321, + "language_loss": 0.78431952, + "learning_rate": 6.998308393221906e-07, + "loss": 0.81173062, + "num_input_tokens_seen": 131074360, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.30859375, + "step": 6100, + "time_per_iteration": 3.0835795402526855 + }, + { + "auxiliary_loss_clip": 0.0149139, + "auxiliary_loss_mlp": 0.01262284, + "balance_loss_clip": 1.13477015, + "balance_loss_mlp": 1.03130436, + "epoch": 0.733601875789094, + "flos": 20737997336160.0, + "grad_norm": 3.232108944409734, + "language_loss": 0.71182507, + "learning_rate": 6.992390271136977e-07, + "loss": 0.73936176, + "num_input_tokens_seen": 131090070, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.30859375, + "step": 6101, + "time_per_iteration": 2.9708588123321533 + }, + { + "auxiliary_loss_clip": 0.01486566, + "auxiliary_loss_mlp": 0.01259698, + "balance_loss_clip": 1.13106608, + "balance_loss_mlp": 1.03100657, + "epoch": 0.733722118679733, + "flos": 22566532145760.0, + "grad_norm": 1.7359494583965447, + "language_loss": 0.86110055, + "learning_rate": 6.986474122296094e-07, + "loss": 0.88856316, + "num_input_tokens_seen": 131109185, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.28710938, + "step": 6102, + "time_per_iteration": 3.003188133239746 + }, + { + "auxiliary_loss_clip": 0.0149026, + "auxiliary_loss_mlp": 0.01265563, + "balance_loss_clip": 1.13438058, + "balance_loss_mlp": 1.03191257, + "epoch": 0.7338423615703722, + "flos": 20086271205120.0, + "grad_norm": 1.76326355895854, + "language_loss": 0.72528052, + "learning_rate": 6.980559947596751e-07, + "loss": 0.75283873, + "num_input_tokens_seen": 131127725, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.33203125, + "step": 6103, + "time_per_iteration": 3.0798635482788086 + }, + { + "auxiliary_loss_clip": 0.01487988, + "auxiliary_loss_mlp": 0.01256645, + "balance_loss_clip": 1.13348782, + "balance_loss_mlp": 1.02680969, + "epoch": 0.7339626044610112, + "flos": 21689738127360.0, + "grad_norm": 2.3725151170115124, + "language_loss": 0.75892305, + "learning_rate": 6.974647747936109e-07, + "loss": 0.78636932, + "num_input_tokens_seen": 131146110, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 2.29492188, + "step": 6104, + "time_per_iteration": 3.0870072841644287 + }, + { + "auxiliary_loss_clip": 0.01489047, + "auxiliary_loss_mlp": 0.01268815, + "balance_loss_clip": 1.13449216, + "balance_loss_mlp": 1.03688169, + "epoch": 0.7340828473516503, + "flos": 15270067429920.0, + "grad_norm": 2.405509208493158, + "language_loss": 0.8251847, + "learning_rate": 6.968737524211039e-07, + "loss": 0.85276335, + "num_input_tokens_seen": 131162920, + "router_z_loss_clip": 3.54101562, + "router_z_loss_mlp": 2.31640625, + "step": 6105, + "time_per_iteration": 3.806905508041382 + }, + { + "auxiliary_loss_clip": 0.01490306, + "auxiliary_loss_mlp": 0.01275808, + "balance_loss_clip": 1.13470781, + "balance_loss_mlp": 1.04501915, + "epoch": 0.7342030902422895, + "flos": 22932352892160.0, + "grad_norm": 2.010058278282618, + "language_loss": 0.80511969, + "learning_rate": 6.962829277318132e-07, + "loss": 0.83278084, + "num_input_tokens_seen": 131182515, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.3046875, + "step": 6106, + "time_per_iteration": 2.929795265197754 + }, + { + "auxiliary_loss_clip": 0.01492884, + "auxiliary_loss_mlp": 0.01266228, + "balance_loss_clip": 1.13680816, + "balance_loss_mlp": 1.03696513, + "epoch": 0.7343233331329285, + "flos": 25850233314720.0, + "grad_norm": 2.0466402385340956, + "language_loss": 0.83976114, + "learning_rate": 6.956923008153652e-07, + "loss": 0.86735225, + "num_input_tokens_seen": 131202280, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.29101562, + "step": 6107, + "time_per_iteration": 3.0289101600646973 + }, + { + "auxiliary_loss_clip": 0.0148713, + "auxiliary_loss_mlp": 0.01257404, + "balance_loss_clip": 1.13229573, + "balance_loss_mlp": 1.02661479, + "epoch": 0.7344435760235676, + "flos": 18480983731200.0, + "grad_norm": 2.7190285336391113, + "language_loss": 0.84199631, + "learning_rate": 6.951018717613593e-07, + "loss": 0.86944169, + "num_input_tokens_seen": 131221295, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.30273438, + "step": 6108, + "time_per_iteration": 2.955191135406494 + }, + { + "auxiliary_loss_clip": 0.0148784, + "auxiliary_loss_mlp": 0.0125276, + "balance_loss_clip": 1.13257921, + "balance_loss_mlp": 1.02368748, + "epoch": 0.7345638189142067, + "flos": 17642004452640.0, + "grad_norm": 1.9197107087577374, + "language_loss": 0.78578347, + "learning_rate": 6.945116406593614e-07, + "loss": 0.81318939, + "num_input_tokens_seen": 131240150, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.2890625, + "step": 6109, + "time_per_iteration": 3.775726556777954 + }, + { + "auxiliary_loss_clip": 0.01485677, + "auxiliary_loss_mlp": 0.01266057, + "balance_loss_clip": 1.12924004, + "balance_loss_mlp": 1.03564954, + "epoch": 0.7346840618048458, + "flos": 20261956263840.0, + "grad_norm": 2.0867696001470595, + "language_loss": 0.74362105, + "learning_rate": 6.939216075989089e-07, + "loss": 0.77113843, + "num_input_tokens_seen": 131258080, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.30273438, + "step": 6110, + "time_per_iteration": 3.003868818283081 + }, + { + "auxiliary_loss_clip": 0.01486833, + "auxiliary_loss_mlp": 0.01268607, + "balance_loss_clip": 1.13081419, + "balance_loss_mlp": 1.03915286, + "epoch": 0.7348043046954849, + "flos": 29025762278400.0, + "grad_norm": 5.4533013576837215, + "language_loss": 0.65943003, + "learning_rate": 6.933317726695109e-07, + "loss": 0.68698442, + "num_input_tokens_seen": 131279310, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.29101562, + "step": 6111, + "time_per_iteration": 3.000488519668579 + }, + { + "auxiliary_loss_clip": 0.0148916, + "auxiliary_loss_mlp": 0.01256324, + "balance_loss_clip": 1.13274074, + "balance_loss_mlp": 1.02591634, + "epoch": 0.734924547586124, + "flos": 17933143923360.0, + "grad_norm": 2.759150466760238, + "language_loss": 0.7913695, + "learning_rate": 6.92742135960644e-07, + "loss": 0.81882435, + "num_input_tokens_seen": 131297010, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.30273438, + "step": 6112, + "time_per_iteration": 2.9075684547424316 + }, + { + "auxiliary_loss_clip": 0.01449011, + "auxiliary_loss_mlp": 0.01195526, + "balance_loss_clip": 1.1106329, + "balance_loss_mlp": 1.00326538, + "epoch": 0.7350447904767631, + "flos": 63595614572640.0, + "grad_norm": 0.8183599312171973, + "language_loss": 0.55577034, + "learning_rate": 6.921526975617556e-07, + "loss": 0.58221573, + "num_input_tokens_seen": 131356470, + "router_z_loss_clip": 3.3828125, + "router_z_loss_mlp": 1.921875, + "step": 6113, + "time_per_iteration": 3.5015838146209717 + }, + { + "auxiliary_loss_clip": 0.01490202, + "auxiliary_loss_mlp": 0.01261287, + "balance_loss_clip": 1.13358068, + "balance_loss_mlp": 1.0285902, + "epoch": 0.7351650333674021, + "flos": 21582172772640.0, + "grad_norm": 1.8446892718996566, + "language_loss": 0.75591826, + "learning_rate": 6.915634575622631e-07, + "loss": 0.7834332, + "num_input_tokens_seen": 131374985, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.32226562, + "step": 6114, + "time_per_iteration": 3.006256341934204 + }, + { + "auxiliary_loss_clip": 0.01487197, + "auxiliary_loss_mlp": 0.01258878, + "balance_loss_clip": 1.13261724, + "balance_loss_mlp": 1.02999568, + "epoch": 0.7352852762580413, + "flos": 18188289205920.0, + "grad_norm": 1.822032306004081, + "language_loss": 0.70648408, + "learning_rate": 6.909744160515532e-07, + "loss": 0.73394477, + "num_input_tokens_seen": 131393125, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.28710938, + "step": 6115, + "time_per_iteration": 3.154676675796509 + }, + { + "auxiliary_loss_clip": 0.01492286, + "auxiliary_loss_mlp": 0.01267377, + "balance_loss_clip": 1.13524973, + "balance_loss_mlp": 1.03716052, + "epoch": 0.7354055191486804, + "flos": 38913163398720.0, + "grad_norm": 2.151255970030042, + "language_loss": 0.69395155, + "learning_rate": 6.903855731189849e-07, + "loss": 0.72154814, + "num_input_tokens_seen": 131415760, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 2.30078125, + "step": 6116, + "time_per_iteration": 3.2457032203674316 + }, + { + "auxiliary_loss_clip": 0.01493206, + "auxiliary_loss_mlp": 0.01260093, + "balance_loss_clip": 1.13867879, + "balance_loss_mlp": 1.03063929, + "epoch": 0.7355257620393194, + "flos": 16291862261280.0, + "grad_norm": 2.4484007221769244, + "language_loss": 0.82058769, + "learning_rate": 6.897969288538825e-07, + "loss": 0.84812075, + "num_input_tokens_seen": 131433705, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 2.29296875, + "step": 6117, + "time_per_iteration": 3.127060651779175 + }, + { + "auxiliary_loss_clip": 0.01480476, + "auxiliary_loss_mlp": 0.01269137, + "balance_loss_clip": 1.12417603, + "balance_loss_mlp": 1.03796649, + "epoch": 0.7356460049299585, + "flos": 18116224973280.0, + "grad_norm": 2.186488305802187, + "language_loss": 0.81481493, + "learning_rate": 6.892084833455452e-07, + "loss": 0.84231102, + "num_input_tokens_seen": 131453275, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.30664062, + "step": 6118, + "time_per_iteration": 3.0823476314544678 + }, + { + "auxiliary_loss_clip": 0.01488665, + "auxiliary_loss_mlp": 0.01261254, + "balance_loss_clip": 1.1335578, + "balance_loss_mlp": 1.032372, + "epoch": 0.7357662478205976, + "flos": 21327368843520.0, + "grad_norm": 1.9199962404542368, + "language_loss": 0.84050936, + "learning_rate": 6.886202366832384e-07, + "loss": 0.86800855, + "num_input_tokens_seen": 131474960, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.28710938, + "step": 6119, + "time_per_iteration": 3.8740479946136475 + }, + { + "auxiliary_loss_clip": 0.01487975, + "auxiliary_loss_mlp": 0.01263572, + "balance_loss_clip": 1.13211322, + "balance_loss_mlp": 1.03144765, + "epoch": 0.7358864907112367, + "flos": 14248613952000.0, + "grad_norm": 2.0497276034621774, + "language_loss": 0.73913032, + "learning_rate": 6.880321889561987e-07, + "loss": 0.76664585, + "num_input_tokens_seen": 131492935, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.31640625, + "step": 6120, + "time_per_iteration": 2.946626663208008 + }, + { + "auxiliary_loss_clip": 0.01487073, + "auxiliary_loss_mlp": 0.01267034, + "balance_loss_clip": 1.1303072, + "balance_loss_mlp": 1.03395581, + "epoch": 0.7360067336018757, + "flos": 22311879929280.0, + "grad_norm": 2.0744897414683123, + "language_loss": 0.65622926, + "learning_rate": 6.874443402536338e-07, + "loss": 0.6837703, + "num_input_tokens_seen": 131512025, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 2.32617188, + "step": 6121, + "time_per_iteration": 3.0862932205200195 + }, + { + "auxiliary_loss_clip": 0.01488571, + "auxiliary_loss_mlp": 0.01269218, + "balance_loss_clip": 1.13351798, + "balance_loss_mlp": 1.03652167, + "epoch": 0.7361269764925149, + "flos": 25556856082560.0, + "grad_norm": 1.6667941249765619, + "language_loss": 0.80096161, + "learning_rate": 6.868566906647177e-07, + "loss": 0.82853949, + "num_input_tokens_seen": 131532975, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 2.32226562, + "step": 6122, + "time_per_iteration": 3.0226211547851562 + }, + { + "auxiliary_loss_clip": 0.01491205, + "auxiliary_loss_mlp": 0.01271653, + "balance_loss_clip": 1.13605475, + "balance_loss_mlp": 1.0418179, + "epoch": 0.736247219383154, + "flos": 20378624376960.0, + "grad_norm": 1.8079755010604934, + "language_loss": 0.83903825, + "learning_rate": 6.862692402785984e-07, + "loss": 0.86666685, + "num_input_tokens_seen": 131553225, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 2.29492188, + "step": 6123, + "time_per_iteration": 2.973465919494629 + }, + { + "auxiliary_loss_clip": 0.01440458, + "auxiliary_loss_mlp": 0.01192841, + "balance_loss_clip": 1.10323191, + "balance_loss_mlp": 1.00325012, + "epoch": 0.736367462273793, + "flos": 70347273733440.0, + "grad_norm": 0.6799688482991747, + "language_loss": 0.49535137, + "learning_rate": 6.856819891843899e-07, + "loss": 0.52168441, + "num_input_tokens_seen": 131617930, + "router_z_loss_clip": 3.375, + "router_z_loss_mlp": 1.89453125, + "step": 6124, + "time_per_iteration": 3.5424530506134033 + }, + { + "auxiliary_loss_clip": 0.01485421, + "auxiliary_loss_mlp": 0.0126555, + "balance_loss_clip": 1.13122296, + "balance_loss_mlp": 1.03170931, + "epoch": 0.7364877051644322, + "flos": 22414552551360.0, + "grad_norm": 2.031663077894603, + "language_loss": 0.72483325, + "learning_rate": 6.8509493747118e-07, + "loss": 0.75234294, + "num_input_tokens_seen": 131636740, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 2.3359375, + "step": 6125, + "time_per_iteration": 2.9554336071014404 + }, + { + "auxiliary_loss_clip": 0.01486849, + "auxiliary_loss_mlp": 0.01258974, + "balance_loss_clip": 1.13175201, + "balance_loss_mlp": 1.02856684, + "epoch": 0.7366079480550712, + "flos": 12131898068160.0, + "grad_norm": 2.4366715892343382, + "language_loss": 0.882038, + "learning_rate": 6.845080852280221e-07, + "loss": 0.90949631, + "num_input_tokens_seen": 131653810, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 2.30273438, + "step": 6126, + "time_per_iteration": 3.089036464691162 + }, + { + "auxiliary_loss_clip": 0.01485794, + "auxiliary_loss_mlp": 0.01265118, + "balance_loss_clip": 1.13043046, + "balance_loss_mlp": 1.03471065, + "epoch": 0.7367281909457103, + "flos": 15051143904480.0, + "grad_norm": 1.739275587581855, + "language_loss": 0.74492097, + "learning_rate": 6.839214325439409e-07, + "loss": 0.77243018, + "num_input_tokens_seen": 131671505, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.30078125, + "step": 6127, + "time_per_iteration": 3.0380899906158447 + }, + { + "auxiliary_loss_clip": 0.01484338, + "auxiliary_loss_mlp": 0.01263731, + "balance_loss_clip": 1.13073993, + "balance_loss_mlp": 1.03141558, + "epoch": 0.7368484338363495, + "flos": 23512849210080.0, + "grad_norm": 1.7560580385122937, + "language_loss": 0.71668601, + "learning_rate": 6.833349795079327e-07, + "loss": 0.74416667, + "num_input_tokens_seen": 131690615, + "router_z_loss_clip": 3.54101562, + "router_z_loss_mlp": 2.3203125, + "step": 6128, + "time_per_iteration": 2.9913666248321533 + }, + { + "auxiliary_loss_clip": 0.01490544, + "auxiliary_loss_mlp": 0.01263566, + "balance_loss_clip": 1.13596272, + "balance_loss_mlp": 1.03239512, + "epoch": 0.7369686767269885, + "flos": 27420323091840.0, + "grad_norm": 2.2442207214880687, + "language_loss": 0.69080651, + "learning_rate": 6.827487262089613e-07, + "loss": 0.71834755, + "num_input_tokens_seen": 131711120, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.31054688, + "step": 6129, + "time_per_iteration": 3.044113874435425 + }, + { + "auxiliary_loss_clip": 0.01440913, + "auxiliary_loss_mlp": 0.01198273, + "balance_loss_clip": 1.10344648, + "balance_loss_mlp": 1.00830078, + "epoch": 0.7370889196176276, + "flos": 70300545876000.0, + "grad_norm": 0.929630426986814, + "language_loss": 0.56704855, + "learning_rate": 6.821626727359606e-07, + "loss": 0.59344041, + "num_input_tokens_seen": 131776680, + "router_z_loss_clip": 3.375, + "router_z_loss_mlp": 1.8984375, + "step": 6130, + "time_per_iteration": 3.5212981700897217 + }, + { + "auxiliary_loss_clip": 0.01489903, + "auxiliary_loss_mlp": 0.01263676, + "balance_loss_clip": 1.13469315, + "balance_loss_mlp": 1.0317421, + "epoch": 0.7372091625082667, + "flos": 18042795326880.0, + "grad_norm": 2.5602884586385706, + "language_loss": 0.77388912, + "learning_rate": 6.815768191778348e-07, + "loss": 0.80142498, + "num_input_tokens_seen": 131794760, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 2.31640625, + "step": 6131, + "time_per_iteration": 2.960693120956421 + }, + { + "auxiliary_loss_clip": 0.0148483, + "auxiliary_loss_mlp": 0.01261623, + "balance_loss_clip": 1.1296463, + "balance_loss_mlp": 1.03121531, + "epoch": 0.7373294053989058, + "flos": 33728976972000.0, + "grad_norm": 1.7154306266889547, + "language_loss": 0.72584796, + "learning_rate": 6.809911656234569e-07, + "loss": 0.75331247, + "num_input_tokens_seen": 131816735, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.30273438, + "step": 6132, + "time_per_iteration": 3.070951223373413 + }, + { + "auxiliary_loss_clip": 0.01481399, + "auxiliary_loss_mlp": 0.01252535, + "balance_loss_clip": 1.12664986, + "balance_loss_mlp": 1.02193642, + "epoch": 0.7374496482895448, + "flos": 21508439700960.0, + "grad_norm": 13.82624156078391, + "language_loss": 0.78186941, + "learning_rate": 6.804057121616707e-07, + "loss": 0.80920875, + "num_input_tokens_seen": 131834940, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.3046875, + "step": 6133, + "time_per_iteration": 3.8328540325164795 + }, + { + "auxiliary_loss_clip": 0.01484714, + "auxiliary_loss_mlp": 0.01262849, + "balance_loss_clip": 1.13037312, + "balance_loss_mlp": 1.03129697, + "epoch": 0.737569891180184, + "flos": 24939265659840.0, + "grad_norm": 1.8702154863716491, + "language_loss": 0.72118062, + "learning_rate": 6.798204588812888e-07, + "loss": 0.74865627, + "num_input_tokens_seen": 131854355, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 2.30859375, + "step": 6134, + "time_per_iteration": 2.976505756378174 + }, + { + "auxiliary_loss_clip": 0.01480852, + "auxiliary_loss_mlp": 0.01247028, + "balance_loss_clip": 1.12583065, + "balance_loss_mlp": 1.01452255, + "epoch": 0.7376901340708231, + "flos": 20666464097760.0, + "grad_norm": 1.6883811945482345, + "language_loss": 0.75613976, + "learning_rate": 6.792354058710937e-07, + "loss": 0.7834186, + "num_input_tokens_seen": 131871825, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 2.3203125, + "step": 6135, + "time_per_iteration": 2.975952625274658 + }, + { + "auxiliary_loss_clip": 0.01476577, + "auxiliary_loss_mlp": 0.01251163, + "balance_loss_clip": 1.12004948, + "balance_loss_mlp": 1.02247167, + "epoch": 0.7378103769614621, + "flos": 23808084922080.0, + "grad_norm": 1.985682240474286, + "language_loss": 0.65093255, + "learning_rate": 6.786505532198374e-07, + "loss": 0.67820996, + "num_input_tokens_seen": 131890770, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.28515625, + "step": 6136, + "time_per_iteration": 3.9262945652008057 + }, + { + "auxiliary_loss_clip": 0.01483297, + "auxiliary_loss_mlp": 0.01255742, + "balance_loss_clip": 1.12779558, + "balance_loss_mlp": 1.02418935, + "epoch": 0.7379306198521013, + "flos": 22239436415040.0, + "grad_norm": 2.1821375852295284, + "language_loss": 0.85390866, + "learning_rate": 6.780659010162411e-07, + "loss": 0.88129902, + "num_input_tokens_seen": 131909720, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.3125, + "step": 6137, + "time_per_iteration": 4.000278949737549 + }, + { + "auxiliary_loss_clip": 0.01483288, + "auxiliary_loss_mlp": 0.01259974, + "balance_loss_clip": 1.12772274, + "balance_loss_mlp": 1.02765894, + "epoch": 0.7380508627427403, + "flos": 14904891462240.0, + "grad_norm": 1.6481859142203799, + "language_loss": 0.83142996, + "learning_rate": 6.774814493489975e-07, + "loss": 0.85886264, + "num_input_tokens_seen": 131927395, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.31835938, + "step": 6138, + "time_per_iteration": 3.205925226211548 + }, + { + "auxiliary_loss_clip": 0.01482694, + "auxiliary_loss_mlp": 0.01255037, + "balance_loss_clip": 1.12857699, + "balance_loss_mlp": 1.02653623, + "epoch": 0.7381711056333794, + "flos": 21687803791200.0, + "grad_norm": 1.8454777082880727, + "language_loss": 0.65937549, + "learning_rate": 6.768971983067655e-07, + "loss": 0.6867528, + "num_input_tokens_seen": 131947725, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 2.28320312, + "step": 6139, + "time_per_iteration": 3.0312654972076416 + }, + { + "auxiliary_loss_clip": 0.01440603, + "auxiliary_loss_mlp": 0.01194107, + "balance_loss_clip": 1.1040622, + "balance_loss_mlp": 1.00489807, + "epoch": 0.7382913485240186, + "flos": 52409843928000.0, + "grad_norm": 1.4354280716702084, + "language_loss": 0.6770975, + "learning_rate": 6.763131479781772e-07, + "loss": 0.70344466, + "num_input_tokens_seen": 131997485, + "router_z_loss_clip": 3.3671875, + "router_z_loss_mlp": 1.890625, + "step": 6140, + "time_per_iteration": 3.2343251705169678 + }, + { + "auxiliary_loss_clip": 0.0148342, + "auxiliary_loss_mlp": 0.01252045, + "balance_loss_clip": 1.12874818, + "balance_loss_mlp": 1.02106524, + "epoch": 0.7384115914146576, + "flos": 21800982513600.0, + "grad_norm": 3.337307728467943, + "language_loss": 0.76198649, + "learning_rate": 6.757292984518316e-07, + "loss": 0.78934109, + "num_input_tokens_seen": 132016885, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 2.3046875, + "step": 6141, + "time_per_iteration": 2.9827375411987305 + }, + { + "auxiliary_loss_clip": 0.01440005, + "auxiliary_loss_mlp": 0.01191383, + "balance_loss_clip": 1.10354877, + "balance_loss_mlp": 1.00179291, + "epoch": 0.7385318343052967, + "flos": 61500707745120.0, + "grad_norm": 0.7522615106938143, + "language_loss": 0.56409758, + "learning_rate": 6.751456498162981e-07, + "loss": 0.59041148, + "num_input_tokens_seen": 132075920, + "router_z_loss_clip": 3.3671875, + "router_z_loss_mlp": 1.89453125, + "step": 6142, + "time_per_iteration": 3.2315056324005127 + }, + { + "auxiliary_loss_clip": 0.0148222, + "auxiliary_loss_mlp": 0.01252977, + "balance_loss_clip": 1.12717366, + "balance_loss_mlp": 1.02371335, + "epoch": 0.7386520771959358, + "flos": 17015349199680.0, + "grad_norm": 2.0261260196818043, + "language_loss": 0.85424525, + "learning_rate": 6.745622021601174e-07, + "loss": 0.88159716, + "num_input_tokens_seen": 132092945, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.29101562, + "step": 6143, + "time_per_iteration": 2.9375741481781006 + }, + { + "auxiliary_loss_clip": 0.01481875, + "auxiliary_loss_mlp": 0.0126316, + "balance_loss_clip": 1.12671435, + "balance_loss_mlp": 1.03275228, + "epoch": 0.7387723200865749, + "flos": 18772767980640.0, + "grad_norm": 2.443052143038028, + "language_loss": 0.69837856, + "learning_rate": 6.739789555717954e-07, + "loss": 0.72582901, + "num_input_tokens_seen": 132109920, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.30078125, + "step": 6144, + "time_per_iteration": 2.9986159801483154 + }, + { + "auxiliary_loss_clip": 0.01482634, + "auxiliary_loss_mlp": 0.01261557, + "balance_loss_clip": 1.12789738, + "balance_loss_mlp": 1.03305626, + "epoch": 0.738892562977214, + "flos": 22527579561120.0, + "grad_norm": 2.1650671656506826, + "language_loss": 0.7742089, + "learning_rate": 6.733959101398124e-07, + "loss": 0.80165076, + "num_input_tokens_seen": 132128050, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.28515625, + "step": 6145, + "time_per_iteration": 2.9465129375457764 + }, + { + "auxiliary_loss_clip": 0.01487719, + "auxiliary_loss_mlp": 0.01256783, + "balance_loss_clip": 1.13290083, + "balance_loss_mlp": 1.0269475, + "epoch": 0.7390128058678531, + "flos": 21503319399360.0, + "grad_norm": 1.8457125151455818, + "language_loss": 0.81601393, + "learning_rate": 6.728130659526143e-07, + "loss": 0.84345895, + "num_input_tokens_seen": 132145860, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.296875, + "step": 6146, + "time_per_iteration": 3.801072120666504 + }, + { + "auxiliary_loss_clip": 0.01485331, + "auxiliary_loss_mlp": 0.01263478, + "balance_loss_clip": 1.13114738, + "balance_loss_mlp": 1.03040004, + "epoch": 0.7391330487584922, + "flos": 25778586291840.0, + "grad_norm": 3.575830484394533, + "language_loss": 0.70648646, + "learning_rate": 6.7223042309862e-07, + "loss": 0.73397452, + "num_input_tokens_seen": 132166060, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 2.32617188, + "step": 6147, + "time_per_iteration": 3.1324663162231445 + }, + { + "auxiliary_loss_clip": 0.01483438, + "auxiliary_loss_mlp": 0.01250211, + "balance_loss_clip": 1.12880921, + "balance_loss_mlp": 1.01961219, + "epoch": 0.7392532916491312, + "flos": 28369295127360.0, + "grad_norm": 2.0246579033248855, + "language_loss": 0.73983759, + "learning_rate": 6.716479816662144e-07, + "loss": 0.76717401, + "num_input_tokens_seen": 132187790, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 2.3046875, + "step": 6148, + "time_per_iteration": 2.9661548137664795 + }, + { + "auxiliary_loss_clip": 0.01485486, + "auxiliary_loss_mlp": 0.01259478, + "balance_loss_clip": 1.13174677, + "balance_loss_mlp": 1.0265907, + "epoch": 0.7393735345397703, + "flos": 23588099408160.0, + "grad_norm": 2.3601484273613225, + "language_loss": 0.73748815, + "learning_rate": 6.710657417437531e-07, + "loss": 0.76493782, + "num_input_tokens_seen": 132207495, + "router_z_loss_clip": 3.54101562, + "router_z_loss_mlp": 2.32421875, + "step": 6149, + "time_per_iteration": 2.9892992973327637 + }, + { + "auxiliary_loss_clip": 0.01483587, + "auxiliary_loss_mlp": 0.01257301, + "balance_loss_clip": 1.1297648, + "balance_loss_mlp": 1.02613068, + "epoch": 0.7394937774304094, + "flos": 19976847370560.0, + "grad_norm": 2.7713251591748285, + "language_loss": 0.79870766, + "learning_rate": 6.704837034195628e-07, + "loss": 0.8261165, + "num_input_tokens_seen": 132225960, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 2.30664062, + "step": 6150, + "time_per_iteration": 2.9652886390686035 + }, + { + "auxiliary_loss_clip": 0.0148329, + "auxiliary_loss_mlp": 0.0126692, + "balance_loss_clip": 1.12948525, + "balance_loss_mlp": 1.03498614, + "epoch": 0.7396140203210485, + "flos": 23480344412640.0, + "grad_norm": 2.046290587459331, + "language_loss": 0.84831667, + "learning_rate": 6.699018667819376e-07, + "loss": 0.87581879, + "num_input_tokens_seen": 132245360, + "router_z_loss_clip": 3.54101562, + "router_z_loss_mlp": 2.31445312, + "step": 6151, + "time_per_iteration": 3.0084710121154785 + }, + { + "auxiliary_loss_clip": 0.01483974, + "auxiliary_loss_mlp": 0.01259767, + "balance_loss_clip": 1.12894368, + "balance_loss_mlp": 1.02649844, + "epoch": 0.7397342632116876, + "flos": 25557728430240.0, + "grad_norm": 1.7060954830573174, + "language_loss": 0.72915626, + "learning_rate": 6.693202319191415e-07, + "loss": 0.7565937, + "num_input_tokens_seen": 132267095, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.328125, + "step": 6152, + "time_per_iteration": 3.0052857398986816 + }, + { + "auxiliary_loss_clip": 0.01490271, + "auxiliary_loss_mlp": 0.01255703, + "balance_loss_clip": 1.13548791, + "balance_loss_mlp": 1.02643967, + "epoch": 0.7398545061023267, + "flos": 24757398311040.0, + "grad_norm": 2.0004777092080497, + "language_loss": 0.74573314, + "learning_rate": 6.687387989194084e-07, + "loss": 0.77319288, + "num_input_tokens_seen": 132286610, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 2.29101562, + "step": 6153, + "time_per_iteration": 3.077213764190674 + }, + { + "auxiliary_loss_clip": 0.01482258, + "auxiliary_loss_mlp": 0.01251054, + "balance_loss_clip": 1.12749481, + "balance_loss_mlp": 1.02179074, + "epoch": 0.7399747489929658, + "flos": 16510482361440.0, + "grad_norm": 1.9606460882227024, + "language_loss": 0.79519045, + "learning_rate": 6.681575678709404e-07, + "loss": 0.82252359, + "num_input_tokens_seen": 132305300, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.28710938, + "step": 6154, + "time_per_iteration": 3.0105085372924805 + }, + { + "auxiliary_loss_clip": 0.01485451, + "auxiliary_loss_mlp": 0.01249868, + "balance_loss_clip": 1.13065243, + "balance_loss_mlp": 1.02022362, + "epoch": 0.7400949918836048, + "flos": 24099338177280.0, + "grad_norm": 3.22511311347225, + "language_loss": 0.70843214, + "learning_rate": 6.67576538861911e-07, + "loss": 0.73578537, + "num_input_tokens_seen": 132323875, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.29296875, + "step": 6155, + "time_per_iteration": 3.0102930068969727 + }, + { + "auxiliary_loss_clip": 0.01489414, + "auxiliary_loss_mlp": 0.01257137, + "balance_loss_clip": 1.13636756, + "balance_loss_mlp": 1.02825475, + "epoch": 0.740215234774244, + "flos": 21804889114080.0, + "grad_norm": 1.7677764025426164, + "language_loss": 0.82125783, + "learning_rate": 6.669957119804612e-07, + "loss": 0.84872329, + "num_input_tokens_seen": 132345510, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 2.28710938, + "step": 6156, + "time_per_iteration": 2.9931790828704834 + }, + { + "auxiliary_loss_clip": 0.0148575, + "auxiliary_loss_mlp": 0.01256359, + "balance_loss_clip": 1.13094258, + "balance_loss_mlp": 1.02442551, + "epoch": 0.7403354776648831, + "flos": 18735142881600.0, + "grad_norm": 2.9983675771254688, + "language_loss": 0.72618264, + "learning_rate": 6.66415087314702e-07, + "loss": 0.75360376, + "num_input_tokens_seen": 132360465, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.3203125, + "step": 6157, + "time_per_iteration": 2.968242645263672 + }, + { + "auxiliary_loss_clip": 0.01488015, + "auxiliary_loss_mlp": 0.01261428, + "balance_loss_clip": 1.13294649, + "balance_loss_mlp": 1.03330958, + "epoch": 0.7404557205555221, + "flos": 16911462876480.0, + "grad_norm": 2.733278769341118, + "language_loss": 0.72565472, + "learning_rate": 6.65834664952714e-07, + "loss": 0.75314915, + "num_input_tokens_seen": 132377915, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.27929688, + "step": 6158, + "time_per_iteration": 2.9369044303894043 + }, + { + "auxiliary_loss_clip": 0.01483894, + "auxiliary_loss_mlp": 0.01256774, + "balance_loss_clip": 1.12910914, + "balance_loss_mlp": 1.02808285, + "epoch": 0.7405759634461613, + "flos": 21216503738880.0, + "grad_norm": 1.9334816894666444, + "language_loss": 0.76023149, + "learning_rate": 6.652544449825457e-07, + "loss": 0.78763813, + "num_input_tokens_seen": 132398170, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.28320312, + "step": 6159, + "time_per_iteration": 3.0179436206817627 + }, + { + "auxiliary_loss_clip": 0.01489437, + "auxiliary_loss_mlp": 0.01275524, + "balance_loss_clip": 1.13370919, + "balance_loss_mlp": 1.04340029, + "epoch": 0.7406962063368003, + "flos": 20481903849600.0, + "grad_norm": 1.6687942745160806, + "language_loss": 0.76483327, + "learning_rate": 6.646744274922182e-07, + "loss": 0.79248285, + "num_input_tokens_seen": 132416615, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.31640625, + "step": 6160, + "time_per_iteration": 3.8195197582244873 + }, + { + "auxiliary_loss_clip": 0.01483425, + "auxiliary_loss_mlp": 0.01251627, + "balance_loss_clip": 1.12958026, + "balance_loss_mlp": 1.02140963, + "epoch": 0.7408164492274394, + "flos": 19793955961440.0, + "grad_norm": 6.591619248373461, + "language_loss": 0.75357711, + "learning_rate": 6.640946125697171e-07, + "loss": 0.78092766, + "num_input_tokens_seen": 132434145, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 2.29882812, + "step": 6161, + "time_per_iteration": 2.987163543701172 + }, + { + "auxiliary_loss_clip": 0.01482104, + "auxiliary_loss_mlp": 0.01261723, + "balance_loss_clip": 1.12838268, + "balance_loss_mlp": 1.02997971, + "epoch": 0.7409366921180786, + "flos": 29207402058240.0, + "grad_norm": 2.1081435737246053, + "language_loss": 0.76233619, + "learning_rate": 6.635150003030017e-07, + "loss": 0.78977448, + "num_input_tokens_seen": 132452670, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 2.3125, + "step": 6162, + "time_per_iteration": 3.0766119956970215 + }, + { + "auxiliary_loss_clip": 0.01479569, + "auxiliary_loss_mlp": 0.01259929, + "balance_loss_clip": 1.12510371, + "balance_loss_mlp": 1.0312382, + "epoch": 0.7410569350087176, + "flos": 22932239107680.0, + "grad_norm": 2.811736829877338, + "language_loss": 0.85643196, + "learning_rate": 6.629355907799981e-07, + "loss": 0.88382697, + "num_input_tokens_seen": 132472475, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 2.28320312, + "step": 6163, + "time_per_iteration": 4.051290988922119 + }, + { + "auxiliary_loss_clip": 0.0148025, + "auxiliary_loss_mlp": 0.01261517, + "balance_loss_clip": 1.12479281, + "balance_loss_mlp": 1.02691317, + "epoch": 0.7411771778993567, + "flos": 30442734616320.0, + "grad_norm": 1.6197899077716802, + "language_loss": 0.69416106, + "learning_rate": 6.623563840886015e-07, + "loss": 0.72157878, + "num_input_tokens_seen": 132493400, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.33984375, + "step": 6164, + "time_per_iteration": 2.970634937286377 + }, + { + "auxiliary_loss_clip": 0.0148489, + "auxiliary_loss_mlp": 0.01265976, + "balance_loss_clip": 1.13060355, + "balance_loss_mlp": 1.03938258, + "epoch": 0.7412974207899958, + "flos": 20524156184160.0, + "grad_norm": 1.6814035911313967, + "language_loss": 0.69482744, + "learning_rate": 6.617773803166795e-07, + "loss": 0.72233605, + "num_input_tokens_seen": 132511725, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.265625, + "step": 6165, + "time_per_iteration": 3.7881109714508057 + }, + { + "auxiliary_loss_clip": 0.01487925, + "auxiliary_loss_mlp": 0.01269583, + "balance_loss_clip": 1.13357711, + "balance_loss_mlp": 1.03612328, + "epoch": 0.7414176636806349, + "flos": 22092880547520.0, + "grad_norm": 3.916195060724823, + "language_loss": 0.81963277, + "learning_rate": 6.611985795520634e-07, + "loss": 0.84720784, + "num_input_tokens_seen": 132530270, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 2.33007812, + "step": 6166, + "time_per_iteration": 2.949815511703491 + }, + { + "auxiliary_loss_clip": 0.01492303, + "auxiliary_loss_mlp": 0.0126155, + "balance_loss_clip": 1.13828206, + "balance_loss_mlp": 1.02904439, + "epoch": 0.7415379065712739, + "flos": 25157582334720.0, + "grad_norm": 2.185012202247658, + "language_loss": 0.77246308, + "learning_rate": 6.606199818825588e-07, + "loss": 0.80000162, + "num_input_tokens_seen": 132550725, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.32226562, + "step": 6167, + "time_per_iteration": 2.9444127082824707 + }, + { + "auxiliary_loss_clip": 0.01483233, + "auxiliary_loss_mlp": 0.012597, + "balance_loss_clip": 1.12987506, + "balance_loss_mlp": 1.02872014, + "epoch": 0.7416581494619131, + "flos": 16873534352160.0, + "grad_norm": 1.9557240525048458, + "language_loss": 0.82241189, + "learning_rate": 6.600415873959377e-07, + "loss": 0.84984118, + "num_input_tokens_seen": 132568600, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 2.30664062, + "step": 6168, + "time_per_iteration": 2.9737589359283447 + }, + { + "auxiliary_loss_clip": 0.01487935, + "auxiliary_loss_mlp": 0.01253595, + "balance_loss_clip": 1.13383794, + "balance_loss_mlp": 1.02738345, + "epoch": 0.7417783923525522, + "flos": 28441397288160.0, + "grad_norm": 2.256311715841377, + "language_loss": 0.64999527, + "learning_rate": 6.594633961799437e-07, + "loss": 0.67741054, + "num_input_tokens_seen": 132587640, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 2.26367188, + "step": 6169, + "time_per_iteration": 2.971254825592041 + }, + { + "auxiliary_loss_clip": 0.01482392, + "auxiliary_loss_mlp": 0.01265473, + "balance_loss_clip": 1.12687659, + "balance_loss_mlp": 1.03182268, + "epoch": 0.7418986352431912, + "flos": 20086384989600.0, + "grad_norm": 1.73665582561413, + "language_loss": 0.81608737, + "learning_rate": 6.588854083222857e-07, + "loss": 0.843566, + "num_input_tokens_seen": 132607075, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.33007812, + "step": 6170, + "time_per_iteration": 3.0214312076568604 + }, + { + "auxiliary_loss_clip": 0.01486407, + "auxiliary_loss_mlp": 0.01261894, + "balance_loss_clip": 1.13280082, + "balance_loss_mlp": 1.0297699, + "epoch": 0.7420188781338304, + "flos": 18261529211520.0, + "grad_norm": 2.1741730595933535, + "language_loss": 0.80666077, + "learning_rate": 6.583076239106444e-07, + "loss": 0.83414376, + "num_input_tokens_seen": 132625580, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 2.31640625, + "step": 6171, + "time_per_iteration": 3.0161197185516357 + }, + { + "auxiliary_loss_clip": 0.01483873, + "auxiliary_loss_mlp": 0.01255763, + "balance_loss_clip": 1.12949657, + "balance_loss_mlp": 1.02459228, + "epoch": 0.7421391210244694, + "flos": 13773710724480.0, + "grad_norm": 2.1828792570800397, + "language_loss": 0.75243515, + "learning_rate": 6.577300430326707e-07, + "loss": 0.77983153, + "num_input_tokens_seen": 132640525, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 2.30664062, + "step": 6172, + "time_per_iteration": 2.969799757003784 + }, + { + "auxiliary_loss_clip": 0.014817, + "auxiliary_loss_mlp": 0.01250296, + "balance_loss_clip": 1.12768245, + "balance_loss_mlp": 1.01702738, + "epoch": 0.7422593639151085, + "flos": 15963552829440.0, + "grad_norm": 2.4901427588231164, + "language_loss": 0.72110748, + "learning_rate": 6.571526657759821e-07, + "loss": 0.74842745, + "num_input_tokens_seen": 132656265, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 2.328125, + "step": 6173, + "time_per_iteration": 3.085127830505371 + }, + { + "auxiliary_loss_clip": 0.01486514, + "auxiliary_loss_mlp": 0.01258411, + "balance_loss_clip": 1.13222945, + "balance_loss_mlp": 1.02743149, + "epoch": 0.7423796068057477, + "flos": 30116852586720.0, + "grad_norm": 2.4877700672320007, + "language_loss": 0.70604086, + "learning_rate": 6.565754922281663e-07, + "loss": 0.73349011, + "num_input_tokens_seen": 132678510, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 2.31054688, + "step": 6174, + "time_per_iteration": 4.031033039093018 + }, + { + "auxiliary_loss_clip": 0.01484077, + "auxiliary_loss_mlp": 0.01270332, + "balance_loss_clip": 1.12921095, + "balance_loss_mlp": 1.0378263, + "epoch": 0.7424998496963867, + "flos": 20524004471520.0, + "grad_norm": 1.937300725558492, + "language_loss": 0.7856096, + "learning_rate": 6.559985224767801e-07, + "loss": 0.81315362, + "num_input_tokens_seen": 132696385, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.31835938, + "step": 6175, + "time_per_iteration": 3.0940091609954834 + }, + { + "auxiliary_loss_clip": 0.01491494, + "auxiliary_loss_mlp": 0.01268258, + "balance_loss_clip": 1.1372515, + "balance_loss_mlp": 1.0349896, + "epoch": 0.7426200925870258, + "flos": 21873729453120.0, + "grad_norm": 3.1322694701577274, + "language_loss": 0.75825381, + "learning_rate": 6.55421756609349e-07, + "loss": 0.78585136, + "num_input_tokens_seen": 132714640, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 2.32617188, + "step": 6176, + "time_per_iteration": 2.995344400405884 + }, + { + "auxiliary_loss_clip": 0.01485536, + "auxiliary_loss_mlp": 0.01260626, + "balance_loss_clip": 1.12956512, + "balance_loss_mlp": 1.0317446, + "epoch": 0.7427403354776649, + "flos": 26434446592320.0, + "grad_norm": 2.109823863122191, + "language_loss": 0.78952152, + "learning_rate": 6.54845194713369e-07, + "loss": 0.81698316, + "num_input_tokens_seen": 132735590, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.2890625, + "step": 6177, + "time_per_iteration": 3.025904417037964 + }, + { + "auxiliary_loss_clip": 0.01491688, + "auxiliary_loss_mlp": 0.01261863, + "balance_loss_clip": 1.13748503, + "balance_loss_mlp": 1.03259945, + "epoch": 0.742860578368304, + "flos": 19900269686880.0, + "grad_norm": 2.2244352639745917, + "language_loss": 0.80303288, + "learning_rate": 6.542688368763034e-07, + "loss": 0.83056837, + "num_input_tokens_seen": 132753995, + "router_z_loss_clip": 3.54101562, + "router_z_loss_mlp": 2.2890625, + "step": 6178, + "time_per_iteration": 3.014617919921875 + }, + { + "auxiliary_loss_clip": 0.01485488, + "auxiliary_loss_mlp": 0.01261037, + "balance_loss_clip": 1.13173807, + "balance_loss_mlp": 1.03081977, + "epoch": 0.742980821258943, + "flos": 24829538400000.0, + "grad_norm": 1.6539544576965557, + "language_loss": 0.77179193, + "learning_rate": 6.536926831855854e-07, + "loss": 0.79925716, + "num_input_tokens_seen": 132773160, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 2.29882812, + "step": 6179, + "time_per_iteration": 3.039006471633911 + }, + { + "auxiliary_loss_clip": 0.01482351, + "auxiliary_loss_mlp": 0.01253756, + "balance_loss_clip": 1.12707007, + "balance_loss_mlp": 1.02392077, + "epoch": 0.7431010641495821, + "flos": 25231011981120.0, + "grad_norm": 2.5305862769588203, + "language_loss": 0.73391676, + "learning_rate": 6.531167337286165e-07, + "loss": 0.76127779, + "num_input_tokens_seen": 132793180, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.296875, + "step": 6180, + "time_per_iteration": 2.9581260681152344 + }, + { + "auxiliary_loss_clip": 0.01490632, + "auxiliary_loss_mlp": 0.01261688, + "balance_loss_clip": 1.13547492, + "balance_loss_mlp": 1.03166151, + "epoch": 0.7432213070402213, + "flos": 21764533187520.0, + "grad_norm": 1.6354490196843086, + "language_loss": 0.79802787, + "learning_rate": 6.52540988592768e-07, + "loss": 0.82555103, + "num_input_tokens_seen": 132814200, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.29882812, + "step": 6181, + "time_per_iteration": 3.027723550796509 + }, + { + "auxiliary_loss_clip": 0.01490489, + "auxiliary_loss_mlp": 0.01257512, + "balance_loss_clip": 1.13640213, + "balance_loss_mlp": 1.03015602, + "epoch": 0.7433415499308603, + "flos": 14795619340320.0, + "grad_norm": 2.275241162035839, + "language_loss": 0.83742553, + "learning_rate": 6.519654478653814e-07, + "loss": 0.8649056, + "num_input_tokens_seen": 132832565, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 2.2734375, + "step": 6182, + "time_per_iteration": 2.9182896614074707 + }, + { + "auxiliary_loss_clip": 0.0145167, + "auxiliary_loss_mlp": 0.01193794, + "balance_loss_clip": 1.1153903, + "balance_loss_mlp": 1.00191498, + "epoch": 0.7434617928214994, + "flos": 67162604083200.0, + "grad_norm": 0.7501831985444458, + "language_loss": 0.55979711, + "learning_rate": 6.51390111633763e-07, + "loss": 0.58625174, + "num_input_tokens_seen": 132897840, + "router_z_loss_clip": 3.3671875, + "router_z_loss_mlp": 1.91796875, + "step": 6183, + "time_per_iteration": 3.479855537414551 + }, + { + "auxiliary_loss_clip": 0.01495739, + "auxiliary_loss_mlp": 0.01255677, + "balance_loss_clip": 1.14188695, + "balance_loss_mlp": 1.02660453, + "epoch": 0.7435820357121385, + "flos": 27379853380800.0, + "grad_norm": 1.6510013167322766, + "language_loss": 0.76264811, + "learning_rate": 6.508149799851932e-07, + "loss": 0.79016232, + "num_input_tokens_seen": 132919505, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 2.2890625, + "step": 6184, + "time_per_iteration": 3.068869113922119 + }, + { + "auxiliary_loss_clip": 0.01486644, + "auxiliary_loss_mlp": 0.01251259, + "balance_loss_clip": 1.13389599, + "balance_loss_mlp": 1.0200882, + "epoch": 0.7437022786027776, + "flos": 23989914342720.0, + "grad_norm": 2.5247872667541547, + "language_loss": 0.61293, + "learning_rate": 6.502400530069183e-07, + "loss": 0.64030904, + "num_input_tokens_seen": 132939390, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 2.30859375, + "step": 6185, + "time_per_iteration": 3.033796548843384 + }, + { + "auxiliary_loss_clip": 0.01495049, + "auxiliary_loss_mlp": 0.01263881, + "balance_loss_clip": 1.14057684, + "balance_loss_mlp": 1.02889562, + "epoch": 0.7438225214934167, + "flos": 21868647079680.0, + "grad_norm": 1.816775828626621, + "language_loss": 0.68588978, + "learning_rate": 6.496653307861535e-07, + "loss": 0.71347916, + "num_input_tokens_seen": 132960060, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.34765625, + "step": 6186, + "time_per_iteration": 3.0307562351226807 + }, + { + "auxiliary_loss_clip": 0.0148888, + "auxiliary_loss_mlp": 0.01262022, + "balance_loss_clip": 1.13521385, + "balance_loss_mlp": 1.0300889, + "epoch": 0.7439427643840558, + "flos": 20232182293920.0, + "grad_norm": 2.4448305631103904, + "language_loss": 0.65989375, + "learning_rate": 6.490908134100857e-07, + "loss": 0.68740273, + "num_input_tokens_seen": 132978525, + "router_z_loss_clip": 3.54101562, + "router_z_loss_mlp": 2.31640625, + "step": 6187, + "time_per_iteration": 3.763376474380493 + }, + { + "auxiliary_loss_clip": 0.01490246, + "auxiliary_loss_mlp": 0.01266105, + "balance_loss_clip": 1.13703632, + "balance_loss_mlp": 1.03283656, + "epoch": 0.7440630072746949, + "flos": 20852010478080.0, + "grad_norm": 2.0252048331298367, + "language_loss": 0.6964407, + "learning_rate": 6.48516500965866e-07, + "loss": 0.72400415, + "num_input_tokens_seen": 132998460, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 2.32617188, + "step": 6188, + "time_per_iteration": 2.942051410675049 + }, + { + "auxiliary_loss_clip": 0.01481301, + "auxiliary_loss_mlp": 0.01261823, + "balance_loss_clip": 1.12760043, + "balance_loss_mlp": 1.03122413, + "epoch": 0.7441832501653339, + "flos": 26506093615200.0, + "grad_norm": 1.6989308254473687, + "language_loss": 0.81646883, + "learning_rate": 6.479423935406192e-07, + "loss": 0.84390014, + "num_input_tokens_seen": 133018445, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 2.3046875, + "step": 6189, + "time_per_iteration": 3.1570005416870117 + }, + { + "auxiliary_loss_clip": 0.01449613, + "auxiliary_loss_mlp": 0.01193375, + "balance_loss_clip": 1.11247838, + "balance_loss_mlp": 1.00416565, + "epoch": 0.7443034930559731, + "flos": 68609274534720.0, + "grad_norm": 0.8418778634367999, + "language_loss": 0.61920995, + "learning_rate": 6.473684912214357e-07, + "loss": 0.64563984, + "num_input_tokens_seen": 133082005, + "router_z_loss_clip": 3.375, + "router_z_loss_mlp": 1.890625, + "step": 6190, + "time_per_iteration": 4.492281675338745 + }, + { + "auxiliary_loss_clip": 0.01487968, + "auxiliary_loss_mlp": 0.01264489, + "balance_loss_clip": 1.13399792, + "balance_loss_mlp": 1.0356071, + "epoch": 0.7444237359466122, + "flos": 18656630861760.0, + "grad_norm": 17.095283586721376, + "language_loss": 0.69804394, + "learning_rate": 6.467947940953778e-07, + "loss": 0.72556853, + "num_input_tokens_seen": 133100530, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 2.28515625, + "step": 6191, + "time_per_iteration": 2.988680124282837 + }, + { + "auxiliary_loss_clip": 0.01487782, + "auxiliary_loss_mlp": 0.01246345, + "balance_loss_clip": 1.13497424, + "balance_loss_mlp": 1.01708174, + "epoch": 0.7445439788372512, + "flos": 22819667235840.0, + "grad_norm": 1.9225757469214155, + "language_loss": 0.7240659, + "learning_rate": 6.462213022494732e-07, + "loss": 0.75140715, + "num_input_tokens_seen": 133119775, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 2.29101562, + "step": 6192, + "time_per_iteration": 3.915804386138916 + }, + { + "auxiliary_loss_clip": 0.01444668, + "auxiliary_loss_mlp": 0.0119294, + "balance_loss_clip": 1.10809875, + "balance_loss_mlp": 1.00411224, + "epoch": 0.7446642217278904, + "flos": 67052687182560.0, + "grad_norm": 0.7929280402080919, + "language_loss": 0.61111546, + "learning_rate": 6.456480157707201e-07, + "loss": 0.63749158, + "num_input_tokens_seen": 133184550, + "router_z_loss_clip": 3.3671875, + "router_z_loss_mlp": 1.88671875, + "step": 6193, + "time_per_iteration": 3.3836140632629395 + }, + { + "auxiliary_loss_clip": 0.01482385, + "auxiliary_loss_mlp": 0.01263174, + "balance_loss_clip": 1.12851906, + "balance_loss_mlp": 1.03257537, + "epoch": 0.7447844646185294, + "flos": 17419212254880.0, + "grad_norm": 2.6254288470513423, + "language_loss": 0.84965229, + "learning_rate": 6.450749347460866e-07, + "loss": 0.87710786, + "num_input_tokens_seen": 133201525, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 2.30273438, + "step": 6194, + "time_per_iteration": 2.9760560989379883 + }, + { + "auxiliary_loss_clip": 0.01487325, + "auxiliary_loss_mlp": 0.01263927, + "balance_loss_clip": 1.13351011, + "balance_loss_mlp": 1.03294683, + "epoch": 0.7449047075091685, + "flos": 26618362061760.0, + "grad_norm": 5.713739244634075, + "language_loss": 0.78995484, + "learning_rate": 6.445020592625083e-07, + "loss": 0.81746733, + "num_input_tokens_seen": 133222175, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 2.30859375, + "step": 6195, + "time_per_iteration": 3.0485730171203613 + }, + { + "auxiliary_loss_clip": 0.01479166, + "auxiliary_loss_mlp": 0.01254631, + "balance_loss_clip": 1.12508643, + "balance_loss_mlp": 1.02479601, + "epoch": 0.7450249503998077, + "flos": 14172036268320.0, + "grad_norm": 2.2918384782795624, + "language_loss": 0.79937816, + "learning_rate": 6.4392938940689e-07, + "loss": 0.82671607, + "num_input_tokens_seen": 133237590, + "router_z_loss_clip": 3.54101562, + "router_z_loss_mlp": 2.296875, + "step": 6196, + "time_per_iteration": 3.041846752166748 + }, + { + "auxiliary_loss_clip": 0.01486688, + "auxiliary_loss_mlp": 0.01253097, + "balance_loss_clip": 1.13263345, + "balance_loss_mlp": 1.02268946, + "epoch": 0.7451451932904467, + "flos": 19608561293760.0, + "grad_norm": 2.438761355955325, + "language_loss": 0.71179169, + "learning_rate": 6.433569252661049e-07, + "loss": 0.73918951, + "num_input_tokens_seen": 133255590, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.3046875, + "step": 6197, + "time_per_iteration": 3.041513681411743 + }, + { + "auxiliary_loss_clip": 0.01484667, + "auxiliary_loss_mlp": 0.01262203, + "balance_loss_clip": 1.13160813, + "balance_loss_mlp": 1.03332102, + "epoch": 0.7452654361810858, + "flos": 12497074035840.0, + "grad_norm": 1.7971845647337747, + "language_loss": 0.71405482, + "learning_rate": 6.427846669269952e-07, + "loss": 0.7415235, + "num_input_tokens_seen": 133273210, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 2.28515625, + "step": 6198, + "time_per_iteration": 3.0561723709106445 + }, + { + "auxiliary_loss_clip": 0.01489954, + "auxiliary_loss_mlp": 0.01257108, + "balance_loss_clip": 1.13681924, + "balance_loss_mlp": 1.02727282, + "epoch": 0.7453856790717249, + "flos": 22129519514400.0, + "grad_norm": 2.969932981061699, + "language_loss": 0.82963604, + "learning_rate": 6.422126144763729e-07, + "loss": 0.85710669, + "num_input_tokens_seen": 133292600, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 2.29492188, + "step": 6199, + "time_per_iteration": 2.9362728595733643 + }, + { + "auxiliary_loss_clip": 0.01484848, + "auxiliary_loss_mlp": 0.01274088, + "balance_loss_clip": 1.129287, + "balance_loss_mlp": 1.04024696, + "epoch": 0.745505921962364, + "flos": 20012727774240.0, + "grad_norm": 2.745019662002683, + "language_loss": 0.77044916, + "learning_rate": 6.416407680010174e-07, + "loss": 0.79803854, + "num_input_tokens_seen": 133306960, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.33398438, + "step": 6200, + "time_per_iteration": 2.9282007217407227 + }, + { + "auxiliary_loss_clip": 0.0148649, + "auxiliary_loss_mlp": 0.01269799, + "balance_loss_clip": 1.13283634, + "balance_loss_mlp": 1.03519499, + "epoch": 0.745626164853003, + "flos": 24680024136000.0, + "grad_norm": 2.176268123166185, + "language_loss": 0.81012416, + "learning_rate": 6.410691275876774e-07, + "loss": 0.83768708, + "num_input_tokens_seen": 133326380, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 2.34179688, + "step": 6201, + "time_per_iteration": 4.011315107345581 + }, + { + "auxiliary_loss_clip": 0.01487684, + "auxiliary_loss_mlp": 0.0126713, + "balance_loss_clip": 1.13309729, + "balance_loss_mlp": 1.03615057, + "epoch": 0.7457464077436422, + "flos": 14540625770400.0, + "grad_norm": 2.2880504603879115, + "language_loss": 0.76862597, + "learning_rate": 6.404976933230704e-07, + "loss": 0.79617417, + "num_input_tokens_seen": 133342900, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.30859375, + "step": 6202, + "time_per_iteration": 2.9690756797790527 + }, + { + "auxiliary_loss_clip": 0.01492435, + "auxiliary_loss_mlp": 0.01259091, + "balance_loss_clip": 1.13769078, + "balance_loss_mlp": 1.02639437, + "epoch": 0.7458666506342813, + "flos": 34024023043200.0, + "grad_norm": 3.225941663869319, + "language_loss": 0.7289567, + "learning_rate": 6.399264652938813e-07, + "loss": 0.75647199, + "num_input_tokens_seen": 133363805, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 2.32226562, + "step": 6203, + "time_per_iteration": 3.2434322834014893 + }, + { + "auxiliary_loss_clip": 0.01484666, + "auxiliary_loss_mlp": 0.01255233, + "balance_loss_clip": 1.12947214, + "balance_loss_mlp": 1.02577901, + "epoch": 0.7459868935249203, + "flos": 24281281382400.0, + "grad_norm": 2.4046272544685547, + "language_loss": 0.74951977, + "learning_rate": 6.393554435867679e-07, + "loss": 0.77691877, + "num_input_tokens_seen": 133384655, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.29296875, + "step": 6204, + "time_per_iteration": 3.0220465660095215 + }, + { + "auxiliary_loss_clip": 0.0148622, + "auxiliary_loss_mlp": 0.01263821, + "balance_loss_clip": 1.13211703, + "balance_loss_mlp": 1.03265071, + "epoch": 0.7461071364155595, + "flos": 21910975270560.0, + "grad_norm": 2.475248416308501, + "language_loss": 0.84048438, + "learning_rate": 6.387846282883502e-07, + "loss": 0.86798483, + "num_input_tokens_seen": 133401185, + "router_z_loss_clip": 3.54101562, + "router_z_loss_mlp": 2.30859375, + "step": 6205, + "time_per_iteration": 2.9925856590270996 + }, + { + "auxiliary_loss_clip": 0.01487942, + "auxiliary_loss_mlp": 0.01252151, + "balance_loss_clip": 1.1343143, + "balance_loss_mlp": 1.02098012, + "epoch": 0.7462273793061985, + "flos": 22891731468480.0, + "grad_norm": 2.1794302413268034, + "language_loss": 0.77233422, + "learning_rate": 6.38214019485223e-07, + "loss": 0.79973513, + "num_input_tokens_seen": 133420010, + "router_z_loss_clip": 3.54101562, + "router_z_loss_mlp": 2.30859375, + "step": 6206, + "time_per_iteration": 3.023954153060913 + }, + { + "auxiliary_loss_clip": 0.01488011, + "auxiliary_loss_mlp": 0.01255179, + "balance_loss_clip": 1.13466072, + "balance_loss_mlp": 1.02496195, + "epoch": 0.7463476221968376, + "flos": 19970172014400.0, + "grad_norm": 2.0423071152171275, + "language_loss": 0.71674299, + "learning_rate": 6.376436172639461e-07, + "loss": 0.74417484, + "num_input_tokens_seen": 133437855, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 2.29882812, + "step": 6207, + "time_per_iteration": 3.0347139835357666 + }, + { + "auxiliary_loss_clip": 0.01483583, + "auxiliary_loss_mlp": 0.01256556, + "balance_loss_clip": 1.12908077, + "balance_loss_mlp": 1.02576673, + "epoch": 0.7464678650874768, + "flos": 16838488368000.0, + "grad_norm": 2.6730741239265465, + "language_loss": 0.65420496, + "learning_rate": 6.370734217110487e-07, + "loss": 0.68160629, + "num_input_tokens_seen": 133456600, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 2.3046875, + "step": 6208, + "time_per_iteration": 3.0100200176239014 + }, + { + "auxiliary_loss_clip": 0.0149142, + "auxiliary_loss_mlp": 0.01262283, + "balance_loss_clip": 1.13699555, + "balance_loss_mlp": 1.02615321, + "epoch": 0.7465881079781158, + "flos": 48104272435680.0, + "grad_norm": 1.5237623828629436, + "language_loss": 0.64273763, + "learning_rate": 6.36503432913031e-07, + "loss": 0.67027462, + "num_input_tokens_seen": 133479745, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 2.35742188, + "step": 6209, + "time_per_iteration": 3.197169542312622 + }, + { + "auxiliary_loss_clip": 0.01481713, + "auxiliary_loss_mlp": 0.01264174, + "balance_loss_clip": 1.1263864, + "balance_loss_mlp": 1.02976131, + "epoch": 0.7467083508687549, + "flos": 19679108400000.0, + "grad_norm": 2.9260227766248006, + "language_loss": 0.69051927, + "learning_rate": 6.359336509563569e-07, + "loss": 0.71797812, + "num_input_tokens_seen": 133495765, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.33789062, + "step": 6210, + "time_per_iteration": 2.958320140838623 + }, + { + "auxiliary_loss_clip": 0.0148414, + "auxiliary_loss_mlp": 0.01254001, + "balance_loss_clip": 1.13013613, + "balance_loss_mlp": 1.02416539, + "epoch": 0.7468285937593939, + "flos": 17897415232320.0, + "grad_norm": 2.1026509718374644, + "language_loss": 0.80822837, + "learning_rate": 6.353640759274641e-07, + "loss": 0.83560979, + "num_input_tokens_seen": 133514655, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 2.29882812, + "step": 6211, + "time_per_iteration": 3.0165514945983887 + }, + { + "auxiliary_loss_clip": 0.0148311, + "auxiliary_loss_mlp": 0.0126059, + "balance_loss_clip": 1.12853217, + "balance_loss_mlp": 1.03056383, + "epoch": 0.7469488366500331, + "flos": 23143273575840.0, + "grad_norm": 3.21086088297281, + "language_loss": 0.74320138, + "learning_rate": 6.347947079127556e-07, + "loss": 0.77063835, + "num_input_tokens_seen": 133532555, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 2.30078125, + "step": 6212, + "time_per_iteration": 2.9906861782073975 + }, + { + "auxiliary_loss_clip": 0.01487037, + "auxiliary_loss_mlp": 0.01268084, + "balance_loss_clip": 1.13392997, + "balance_loss_mlp": 1.03634095, + "epoch": 0.7470690795406721, + "flos": 16692728991840.0, + "grad_norm": 2.9843167197860447, + "language_loss": 0.76928407, + "learning_rate": 6.342255469986053e-07, + "loss": 0.79683524, + "num_input_tokens_seen": 133551300, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 2.31640625, + "step": 6213, + "time_per_iteration": 2.9658524990081787 + }, + { + "auxiliary_loss_clip": 0.01483982, + "auxiliary_loss_mlp": 0.01254186, + "balance_loss_clip": 1.13038635, + "balance_loss_mlp": 1.02091718, + "epoch": 0.7471893224313112, + "flos": 25195093649280.0, + "grad_norm": 2.0824955360583597, + "language_loss": 0.76398849, + "learning_rate": 6.336565932713533e-07, + "loss": 0.79137015, + "num_input_tokens_seen": 133570725, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 2.328125, + "step": 6214, + "time_per_iteration": 3.8720297813415527 + }, + { + "auxiliary_loss_clip": 0.01483144, + "auxiliary_loss_mlp": 0.01260313, + "balance_loss_clip": 1.12848675, + "balance_loss_mlp": 1.02856982, + "epoch": 0.7473095653219504, + "flos": 22528300196160.0, + "grad_norm": 3.4029409298301103, + "language_loss": 0.7778244, + "learning_rate": 6.330878468173088e-07, + "loss": 0.80525899, + "num_input_tokens_seen": 133590790, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.31054688, + "step": 6215, + "time_per_iteration": 3.0114614963531494 + }, + { + "auxiliary_loss_clip": 0.01482387, + "auxiliary_loss_mlp": 0.01258976, + "balance_loss_clip": 1.12733555, + "balance_loss_mlp": 1.02647066, + "epoch": 0.7474298082125894, + "flos": 18115959476160.0, + "grad_norm": 1.8586410610220891, + "language_loss": 0.73172641, + "learning_rate": 6.32519307722752e-07, + "loss": 0.75914001, + "num_input_tokens_seen": 133608685, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.32226562, + "step": 6216, + "time_per_iteration": 3.047304630279541 + }, + { + "auxiliary_loss_clip": 0.01447424, + "auxiliary_loss_mlp": 0.01197792, + "balance_loss_clip": 1.10962498, + "balance_loss_mlp": 1.00896454, + "epoch": 0.7475500511032285, + "flos": 62093416930560.0, + "grad_norm": 0.9100827813207439, + "language_loss": 0.54946482, + "learning_rate": 6.31950976073929e-07, + "loss": 0.57591701, + "num_input_tokens_seen": 133662775, + "router_z_loss_clip": 3.375, + "router_z_loss_mlp": 1.88671875, + "step": 6217, + "time_per_iteration": 3.4936299324035645 + }, + { + "auxiliary_loss_clip": 0.01488925, + "auxiliary_loss_mlp": 0.01258261, + "balance_loss_clip": 1.133582, + "balance_loss_mlp": 1.0295701, + "epoch": 0.7476702939938676, + "flos": 17787536259840.0, + "grad_norm": 2.0502931619844595, + "language_loss": 0.80852652, + "learning_rate": 6.31382851957055e-07, + "loss": 0.83599836, + "num_input_tokens_seen": 133679595, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.28515625, + "step": 6218, + "time_per_iteration": 3.993798017501831 + }, + { + "auxiliary_loss_clip": 0.01487605, + "auxiliary_loss_mlp": 0.01262603, + "balance_loss_clip": 1.1332742, + "balance_loss_mlp": 1.03143275, + "epoch": 0.7477905368845067, + "flos": 27930424016160.0, + "grad_norm": 3.498858529999373, + "language_loss": 0.71911681, + "learning_rate": 6.308149354583143e-07, + "loss": 0.74661893, + "num_input_tokens_seen": 133699000, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.30859375, + "step": 6219, + "time_per_iteration": 3.0598971843719482 + }, + { + "auxiliary_loss_clip": 0.01485951, + "auxiliary_loss_mlp": 0.01265167, + "balance_loss_clip": 1.13212371, + "balance_loss_mlp": 1.03266072, + "epoch": 0.7479107797751458, + "flos": 26872824637440.0, + "grad_norm": 2.2717044951135956, + "language_loss": 0.81936216, + "learning_rate": 6.302472266638586e-07, + "loss": 0.8468734, + "num_input_tokens_seen": 133719540, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 2.32226562, + "step": 6220, + "time_per_iteration": 3.824157953262329 + }, + { + "auxiliary_loss_clip": 0.01489839, + "auxiliary_loss_mlp": 0.01263288, + "balance_loss_clip": 1.1359539, + "balance_loss_mlp": 1.02544177, + "epoch": 0.7480310226657849, + "flos": 33945928233120.0, + "grad_norm": 2.719570807433969, + "language_loss": 0.7004233, + "learning_rate": 6.296797256598101e-07, + "loss": 0.72795457, + "num_input_tokens_seen": 133741020, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 2.37304688, + "step": 6221, + "time_per_iteration": 3.0624020099639893 + }, + { + "auxiliary_loss_clip": 0.01488132, + "auxiliary_loss_mlp": 0.01250344, + "balance_loss_clip": 1.13525891, + "balance_loss_mlp": 1.02146196, + "epoch": 0.748151265556424, + "flos": 24828855693120.0, + "grad_norm": 2.1319858904707516, + "language_loss": 0.81562787, + "learning_rate": 6.291124325322576e-07, + "loss": 0.84301269, + "num_input_tokens_seen": 133761145, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 2.28710938, + "step": 6222, + "time_per_iteration": 2.945483922958374 + }, + { + "auxiliary_loss_clip": 0.01488337, + "auxiliary_loss_mlp": 0.01254119, + "balance_loss_clip": 1.13368881, + "balance_loss_mlp": 1.02275777, + "epoch": 0.748271508447063, + "flos": 38402417695680.0, + "grad_norm": 2.0389789193305123, + "language_loss": 0.62479955, + "learning_rate": 6.285453473672595e-07, + "loss": 0.65222406, + "num_input_tokens_seen": 133783715, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 2.30859375, + "step": 6223, + "time_per_iteration": 3.1480677127838135 + }, + { + "auxiliary_loss_clip": 0.01479222, + "auxiliary_loss_mlp": 0.01258132, + "balance_loss_clip": 1.12531137, + "balance_loss_mlp": 1.024863, + "epoch": 0.7483917513377022, + "flos": 21543902894880.0, + "grad_norm": 3.1192979054985175, + "language_loss": 0.75732845, + "learning_rate": 6.279784702508415e-07, + "loss": 0.78470194, + "num_input_tokens_seen": 133804465, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 2.328125, + "step": 6224, + "time_per_iteration": 2.9322221279144287 + }, + { + "auxiliary_loss_clip": 0.01440526, + "auxiliary_loss_mlp": 0.01195694, + "balance_loss_clip": 1.10393572, + "balance_loss_mlp": 1.00610352, + "epoch": 0.7485119942283412, + "flos": 62321936280480.0, + "grad_norm": 0.7969399287660439, + "language_loss": 0.58588612, + "learning_rate": 6.274118012689979e-07, + "loss": 0.6122483, + "num_input_tokens_seen": 133866365, + "router_z_loss_clip": 3.3671875, + "router_z_loss_mlp": 1.89453125, + "step": 6225, + "time_per_iteration": 3.549891710281372 + }, + { + "auxiliary_loss_clip": 0.01480577, + "auxiliary_loss_mlp": 0.01261008, + "balance_loss_clip": 1.12550616, + "balance_loss_mlp": 1.03098142, + "epoch": 0.7486322371189803, + "flos": 29940371036640.0, + "grad_norm": 2.1255388901097554, + "language_loss": 0.68392634, + "learning_rate": 6.268453405076943e-07, + "loss": 0.71134222, + "num_input_tokens_seen": 133888760, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.29492188, + "step": 6226, + "time_per_iteration": 3.001532554626465 + }, + { + "auxiliary_loss_clip": 0.01481575, + "auxiliary_loss_mlp": 0.01257883, + "balance_loss_clip": 1.12801456, + "balance_loss_mlp": 1.02652168, + "epoch": 0.7487524800096195, + "flos": 18951221795040.0, + "grad_norm": 2.1062487320525354, + "language_loss": 0.82354426, + "learning_rate": 6.262790880528592e-07, + "loss": 0.8509388, + "num_input_tokens_seen": 133906380, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 2.30664062, + "step": 6227, + "time_per_iteration": 2.9326562881469727 + }, + { + "auxiliary_loss_clip": 0.01481878, + "auxiliary_loss_mlp": 0.01266067, + "balance_loss_clip": 1.1257143, + "balance_loss_mlp": 1.03546906, + "epoch": 0.7488727229002585, + "flos": 18699376262400.0, + "grad_norm": 2.5251142141678478, + "language_loss": 0.79714704, + "learning_rate": 6.257130439903951e-07, + "loss": 0.82462656, + "num_input_tokens_seen": 133922875, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.30078125, + "step": 6228, + "time_per_iteration": 3.0057315826416016 + }, + { + "auxiliary_loss_clip": 0.01486629, + "auxiliary_loss_mlp": 0.01262992, + "balance_loss_clip": 1.133641, + "balance_loss_mlp": 1.03010488, + "epoch": 0.7489929657908976, + "flos": 23625648650880.0, + "grad_norm": 1.8133426432880928, + "language_loss": 0.81456161, + "learning_rate": 6.251472084061695e-07, + "loss": 0.84205782, + "num_input_tokens_seen": 133941795, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 2.32617188, + "step": 6229, + "time_per_iteration": 3.985849142074585 + }, + { + "auxiliary_loss_clip": 0.01482219, + "auxiliary_loss_mlp": 0.01261427, + "balance_loss_clip": 1.12737083, + "balance_loss_mlp": 1.03178251, + "epoch": 0.7491132086815367, + "flos": 20553247447200.0, + "grad_norm": 2.5075591071897114, + "language_loss": 0.88839966, + "learning_rate": 6.245815813860191e-07, + "loss": 0.91583616, + "num_input_tokens_seen": 133957305, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.29492188, + "step": 6230, + "time_per_iteration": 3.1363580226898193 + }, + { + "auxiliary_loss_clip": 0.01482854, + "auxiliary_loss_mlp": 0.01264781, + "balance_loss_clip": 1.12830472, + "balance_loss_mlp": 1.03284764, + "epoch": 0.7492334515721758, + "flos": 23005251544320.0, + "grad_norm": 2.0563177581115584, + "language_loss": 0.70547044, + "learning_rate": 6.240161630157495e-07, + "loss": 0.73294675, + "num_input_tokens_seen": 133976660, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.31640625, + "step": 6231, + "time_per_iteration": 3.1259543895721436 + }, + { + "auxiliary_loss_clip": 0.01484665, + "auxiliary_loss_mlp": 0.01266192, + "balance_loss_clip": 1.13057852, + "balance_loss_mlp": 1.03196955, + "epoch": 0.7493536944628149, + "flos": 16400679245280.0, + "grad_norm": 2.1339787072889593, + "language_loss": 0.7009263, + "learning_rate": 6.23450953381133e-07, + "loss": 0.72843486, + "num_input_tokens_seen": 133994750, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 2.33789062, + "step": 6232, + "time_per_iteration": 3.0967326164245605 + }, + { + "auxiliary_loss_clip": 0.01478787, + "auxiliary_loss_mlp": 0.01250901, + "balance_loss_clip": 1.12322152, + "balance_loss_mlp": 1.02182889, + "epoch": 0.749473937353454, + "flos": 15340235254560.0, + "grad_norm": 1.9804651147571404, + "language_loss": 0.67907715, + "learning_rate": 6.228859525679131e-07, + "loss": 0.70637405, + "num_input_tokens_seen": 134009165, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.2890625, + "step": 6233, + "time_per_iteration": 3.090291738510132 + }, + { + "auxiliary_loss_clip": 0.01485086, + "auxiliary_loss_mlp": 0.01257488, + "balance_loss_clip": 1.13150406, + "balance_loss_mlp": 1.02498245, + "epoch": 0.7495941802440931, + "flos": 18953042346720.0, + "grad_norm": 3.4067930485498827, + "language_loss": 0.7970227, + "learning_rate": 6.223211606617986e-07, + "loss": 0.82444847, + "num_input_tokens_seen": 134027585, + "router_z_loss_clip": 3.54101562, + "router_z_loss_mlp": 2.3203125, + "step": 6234, + "time_per_iteration": 3.0406203269958496 + }, + { + "auxiliary_loss_clip": 0.01487023, + "auxiliary_loss_mlp": 0.01257721, + "balance_loss_clip": 1.1321578, + "balance_loss_mlp": 1.03055573, + "epoch": 0.7497144231347321, + "flos": 22494695482080.0, + "grad_norm": 1.8684516467578363, + "language_loss": 0.84099233, + "learning_rate": 6.217565777484701e-07, + "loss": 0.86843979, + "num_input_tokens_seen": 134046680, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.2734375, + "step": 6235, + "time_per_iteration": 3.0481772422790527 + }, + { + "auxiliary_loss_clip": 0.0148682, + "auxiliary_loss_mlp": 0.01256355, + "balance_loss_clip": 1.13278055, + "balance_loss_mlp": 1.02556586, + "epoch": 0.7498346660253713, + "flos": 24245818188480.0, + "grad_norm": 2.7885885737755927, + "language_loss": 0.80181849, + "learning_rate": 6.211922039135722e-07, + "loss": 0.82925034, + "num_input_tokens_seen": 134066825, + "router_z_loss_clip": 3.54101562, + "router_z_loss_mlp": 2.30664062, + "step": 6236, + "time_per_iteration": 3.0630083084106445 + }, + { + "auxiliary_loss_clip": 0.01485785, + "auxiliary_loss_mlp": 0.01264317, + "balance_loss_clip": 1.13075495, + "balance_loss_mlp": 1.03371859, + "epoch": 0.7499549089160104, + "flos": 24389263946880.0, + "grad_norm": 5.498369279365099, + "language_loss": 0.81211388, + "learning_rate": 6.206280392427201e-07, + "loss": 0.83961487, + "num_input_tokens_seen": 134086410, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 2.3046875, + "step": 6237, + "time_per_iteration": 3.0358479022979736 + }, + { + "auxiliary_loss_clip": 0.01485763, + "auxiliary_loss_mlp": 0.01258558, + "balance_loss_clip": 1.13096654, + "balance_loss_mlp": 1.02872241, + "epoch": 0.7500751518066494, + "flos": 34060131015840.0, + "grad_norm": 1.6191016100266384, + "language_loss": 0.73691803, + "learning_rate": 6.200640838214983e-07, + "loss": 0.76436126, + "num_input_tokens_seen": 134109185, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.296875, + "step": 6238, + "time_per_iteration": 3.035550594329834 + }, + { + "auxiliary_loss_clip": 0.0148206, + "auxiliary_loss_mlp": 0.01257589, + "balance_loss_clip": 1.1277982, + "balance_loss_mlp": 1.02699018, + "epoch": 0.7501953946972886, + "flos": 18845590776480.0, + "grad_norm": 1.8660986404969333, + "language_loss": 0.66750622, + "learning_rate": 6.195003377354578e-07, + "loss": 0.69490272, + "num_input_tokens_seen": 134128455, + "router_z_loss_clip": 3.54101562, + "router_z_loss_mlp": 2.30273438, + "step": 6239, + "time_per_iteration": 2.9992306232452393 + }, + { + "auxiliary_loss_clip": 0.01482969, + "auxiliary_loss_mlp": 0.01268789, + "balance_loss_clip": 1.12832427, + "balance_loss_mlp": 1.03704596, + "epoch": 0.7503156375879276, + "flos": 20259263364480.0, + "grad_norm": 2.629697467221053, + "language_loss": 0.73210645, + "learning_rate": 6.189368010701183e-07, + "loss": 0.759624, + "num_input_tokens_seen": 134145515, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.31445312, + "step": 6240, + "time_per_iteration": 2.9685728549957275 + }, + { + "auxiliary_loss_clip": 0.01484088, + "auxiliary_loss_mlp": 0.01253937, + "balance_loss_clip": 1.12953448, + "balance_loss_mlp": 1.02257538, + "epoch": 0.7504358804785667, + "flos": 13481736834240.0, + "grad_norm": 2.1209515310601414, + "language_loss": 0.76356381, + "learning_rate": 6.183734739109683e-07, + "loss": 0.7909441, + "num_input_tokens_seen": 134163335, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 2.31054688, + "step": 6241, + "time_per_iteration": 3.018311023712158 + }, + { + "auxiliary_loss_clip": 0.01486137, + "auxiliary_loss_mlp": 0.01269387, + "balance_loss_clip": 1.1303196, + "balance_loss_mlp": 1.03745377, + "epoch": 0.7505561233692057, + "flos": 29463647257440.0, + "grad_norm": 2.4211944666072283, + "language_loss": 0.68788809, + "learning_rate": 6.178103563434629e-07, + "loss": 0.71544331, + "num_input_tokens_seen": 134182335, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.31835938, + "step": 6242, + "time_per_iteration": 4.0400071144104 + }, + { + "auxiliary_loss_clip": 0.01487227, + "auxiliary_loss_mlp": 0.01263092, + "balance_loss_clip": 1.13345671, + "balance_loss_mlp": 1.03134918, + "epoch": 0.7506763662598449, + "flos": 20304512023680.0, + "grad_norm": 1.8776815079821694, + "language_loss": 0.83737946, + "learning_rate": 6.172474484530283e-07, + "loss": 0.86488271, + "num_input_tokens_seen": 134201070, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 2.31640625, + "step": 6243, + "time_per_iteration": 3.0111865997314453 + }, + { + "auxiliary_loss_clip": 0.01486268, + "auxiliary_loss_mlp": 0.01257185, + "balance_loss_clip": 1.13087606, + "balance_loss_mlp": 1.02563334, + "epoch": 0.750796609150484, + "flos": 37233915284160.0, + "grad_norm": 1.81365762921473, + "language_loss": 0.76048046, + "learning_rate": 6.166847503250563e-07, + "loss": 0.78791499, + "num_input_tokens_seen": 134223310, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.3125, + "step": 6244, + "time_per_iteration": 3.1581790447235107 + }, + { + "auxiliary_loss_clip": 0.01484094, + "auxiliary_loss_mlp": 0.01257713, + "balance_loss_clip": 1.12970185, + "balance_loss_mlp": 1.02749562, + "epoch": 0.750916852041123, + "flos": 19611443833920.0, + "grad_norm": 3.734735656692399, + "language_loss": 0.78904355, + "learning_rate": 6.161222620449078e-07, + "loss": 0.81646162, + "num_input_tokens_seen": 134242085, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 2.30078125, + "step": 6245, + "time_per_iteration": 3.9820432662963867 + }, + { + "auxiliary_loss_clip": 0.01482547, + "auxiliary_loss_mlp": 0.01261401, + "balance_loss_clip": 1.12745142, + "balance_loss_mlp": 1.03004003, + "epoch": 0.7510370949317622, + "flos": 25114685221440.0, + "grad_norm": 2.5660835969056315, + "language_loss": 0.80108136, + "learning_rate": 6.155599836979117e-07, + "loss": 0.82852083, + "num_input_tokens_seen": 134260770, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.31054688, + "step": 6246, + "time_per_iteration": 3.0208780765533447 + }, + { + "auxiliary_loss_clip": 0.01483507, + "auxiliary_loss_mlp": 0.01262934, + "balance_loss_clip": 1.12802172, + "balance_loss_mlp": 1.03157234, + "epoch": 0.7511573378224012, + "flos": 19064286732960.0, + "grad_norm": 2.183058371300515, + "language_loss": 0.81782568, + "learning_rate": 6.149979153693649e-07, + "loss": 0.84529006, + "num_input_tokens_seen": 134278025, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.31054688, + "step": 6247, + "time_per_iteration": 3.8481431007385254 + }, + { + "auxiliary_loss_clip": 0.01484446, + "auxiliary_loss_mlp": 0.01264352, + "balance_loss_clip": 1.12851906, + "balance_loss_mlp": 1.03260922, + "epoch": 0.7512775807130403, + "flos": 19939715337600.0, + "grad_norm": 2.634863280270339, + "language_loss": 0.76522875, + "learning_rate": 6.144360571445343e-07, + "loss": 0.79271668, + "num_input_tokens_seen": 134297170, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.31054688, + "step": 6248, + "time_per_iteration": 2.999870538711548 + }, + { + "auxiliary_loss_clip": 0.01483306, + "auxiliary_loss_mlp": 0.01265857, + "balance_loss_clip": 1.12764406, + "balance_loss_mlp": 1.03544891, + "epoch": 0.7513978236036795, + "flos": 20741979792960.0, + "grad_norm": 1.731358445911716, + "language_loss": 0.8011595, + "learning_rate": 6.138744091086509e-07, + "loss": 0.82865119, + "num_input_tokens_seen": 134316755, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.30078125, + "step": 6249, + "time_per_iteration": 3.006127119064331 + }, + { + "auxiliary_loss_clip": 0.01492997, + "auxiliary_loss_mlp": 0.01255548, + "balance_loss_clip": 1.1393342, + "balance_loss_mlp": 1.02685666, + "epoch": 0.7515180664943185, + "flos": 27565627330080.0, + "grad_norm": 2.925614614515602, + "language_loss": 0.7317102, + "learning_rate": 6.133129713469183e-07, + "loss": 0.75919557, + "num_input_tokens_seen": 134335960, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 2.28710938, + "step": 6250, + "time_per_iteration": 3.0734775066375732 + }, + { + "auxiliary_loss_clip": 0.01486221, + "auxiliary_loss_mlp": 0.01266569, + "balance_loss_clip": 1.13074923, + "balance_loss_mlp": 1.03406334, + "epoch": 0.7516383093849576, + "flos": 33806047721760.0, + "grad_norm": 4.967149623672474, + "language_loss": 0.6404891, + "learning_rate": 6.127517439445053e-07, + "loss": 0.66801703, + "num_input_tokens_seen": 134356805, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.31835938, + "step": 6251, + "time_per_iteration": 3.0819790363311768 + }, + { + "auxiliary_loss_clip": 0.01483658, + "auxiliary_loss_mlp": 0.01249689, + "balance_loss_clip": 1.13007665, + "balance_loss_mlp": 1.02195132, + "epoch": 0.7517585522755967, + "flos": 29748414797280.0, + "grad_norm": 4.878887344904701, + "language_loss": 0.81418979, + "learning_rate": 6.121907269865498e-07, + "loss": 0.84152329, + "num_input_tokens_seen": 134376295, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 2.27929688, + "step": 6252, + "time_per_iteration": 3.095261812210083 + }, + { + "auxiliary_loss_clip": 0.0144528, + "auxiliary_loss_mlp": 0.01196083, + "balance_loss_clip": 1.10783505, + "balance_loss_mlp": 1.00687408, + "epoch": 0.7518787951662358, + "flos": 69814529697600.0, + "grad_norm": 0.925505974152344, + "language_loss": 0.67162335, + "learning_rate": 6.116299205581577e-07, + "loss": 0.69803703, + "num_input_tokens_seen": 134431125, + "router_z_loss_clip": 3.375, + "router_z_loss_mlp": 1.890625, + "step": 6253, + "time_per_iteration": 3.400841236114502 + }, + { + "auxiliary_loss_clip": 0.01486338, + "auxiliary_loss_mlp": 0.01277476, + "balance_loss_clip": 1.13146996, + "balance_loss_mlp": 1.04420757, + "epoch": 0.7519990380568748, + "flos": 34206042104640.0, + "grad_norm": 1.8732028875335915, + "language_loss": 0.6842891, + "learning_rate": 6.110693247444018e-07, + "loss": 0.71192724, + "num_input_tokens_seen": 134452960, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 2.328125, + "step": 6254, + "time_per_iteration": 3.080209255218506 + }, + { + "auxiliary_loss_clip": 0.01481386, + "auxiliary_loss_mlp": 0.0125316, + "balance_loss_clip": 1.12701631, + "balance_loss_mlp": 1.02504134, + "epoch": 0.752119280947514, + "flos": 21727856292480.0, + "grad_norm": 1.9502676377638652, + "language_loss": 0.82469481, + "learning_rate": 6.105089396303258e-07, + "loss": 0.85204029, + "num_input_tokens_seen": 134471350, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 2.27929688, + "step": 6255, + "time_per_iteration": 3.0070905685424805 + }, + { + "auxiliary_loss_clip": 0.01486765, + "auxiliary_loss_mlp": 0.01260907, + "balance_loss_clip": 1.13307011, + "balance_loss_mlp": 1.0274471, + "epoch": 0.7522395238381531, + "flos": 32745452018400.0, + "grad_norm": 2.0401519039414033, + "language_loss": 0.7590003, + "learning_rate": 6.099487653009383e-07, + "loss": 0.78647697, + "num_input_tokens_seen": 134490695, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 2.328125, + "step": 6256, + "time_per_iteration": 3.0335991382598877 + }, + { + "auxiliary_loss_clip": 0.01479074, + "auxiliary_loss_mlp": 0.0125421, + "balance_loss_clip": 1.12330556, + "balance_loss_mlp": 1.02532768, + "epoch": 0.7523597667287921, + "flos": 23478220435680.0, + "grad_norm": 2.3655578293664243, + "language_loss": 0.82999396, + "learning_rate": 6.093888018412192e-07, + "loss": 0.85732675, + "num_input_tokens_seen": 134506885, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.28710938, + "step": 6257, + "time_per_iteration": 3.847524642944336 + }, + { + "auxiliary_loss_clip": 0.01443688, + "auxiliary_loss_mlp": 0.01194107, + "balance_loss_clip": 1.1065321, + "balance_loss_mlp": 1.00527954, + "epoch": 0.7524800096194313, + "flos": 67354256897280.0, + "grad_norm": 0.7146343558743851, + "language_loss": 0.54615533, + "learning_rate": 6.088290493361125e-07, + "loss": 0.57253331, + "num_input_tokens_seen": 134571770, + "router_z_loss_clip": 3.375, + "router_z_loss_mlp": 1.88671875, + "step": 6258, + "time_per_iteration": 3.5484297275543213 + }, + { + "auxiliary_loss_clip": 0.01483177, + "auxiliary_loss_mlp": 0.0125859, + "balance_loss_clip": 1.12864673, + "balance_loss_mlp": 1.02513051, + "epoch": 0.7526002525100703, + "flos": 13007743882560.0, + "grad_norm": 2.2902078694943713, + "language_loss": 0.71424234, + "learning_rate": 6.082695078705322e-07, + "loss": 0.74166006, + "num_input_tokens_seen": 134589250, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 2.33007812, + "step": 6259, + "time_per_iteration": 2.984159469604492 + }, + { + "auxiliary_loss_clip": 0.01484114, + "auxiliary_loss_mlp": 0.01264785, + "balance_loss_clip": 1.12981391, + "balance_loss_mlp": 1.03094375, + "epoch": 0.7527204954007094, + "flos": 21399357219840.0, + "grad_norm": 4.976765598596939, + "language_loss": 0.68857116, + "learning_rate": 6.077101775293618e-07, + "loss": 0.71606016, + "num_input_tokens_seen": 134608075, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 2.33398438, + "step": 6260, + "time_per_iteration": 3.023317813873291 + }, + { + "auxiliary_loss_clip": 0.01480835, + "auxiliary_loss_mlp": 0.01260025, + "balance_loss_clip": 1.12584257, + "balance_loss_mlp": 1.02847302, + "epoch": 0.7528407382913486, + "flos": 18948870249120.0, + "grad_norm": 2.4017380916112345, + "language_loss": 0.82734859, + "learning_rate": 6.071510583974504e-07, + "loss": 0.85475719, + "num_input_tokens_seen": 134623260, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.31054688, + "step": 6261, + "time_per_iteration": 2.980097770690918 + }, + { + "auxiliary_loss_clip": 0.0148472, + "auxiliary_loss_mlp": 0.01265628, + "balance_loss_clip": 1.1316359, + "balance_loss_mlp": 1.03350413, + "epoch": 0.7529609811819876, + "flos": 15233618103840.0, + "grad_norm": 1.9106946811980052, + "language_loss": 0.72365552, + "learning_rate": 6.065921505596161e-07, + "loss": 0.75115895, + "num_input_tokens_seen": 134641540, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 2.31835938, + "step": 6262, + "time_per_iteration": 2.952582359313965 + }, + { + "auxiliary_loss_clip": 0.01483019, + "auxiliary_loss_mlp": 0.01254722, + "balance_loss_clip": 1.12789416, + "balance_loss_mlp": 1.02564955, + "epoch": 0.7530812240726267, + "flos": 19356526120320.0, + "grad_norm": 2.2477258111103002, + "language_loss": 0.77035165, + "learning_rate": 6.060334541006445e-07, + "loss": 0.79772907, + "num_input_tokens_seen": 134660035, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.29296875, + "step": 6263, + "time_per_iteration": 2.9838039875030518 + }, + { + "auxiliary_loss_clip": 0.01483712, + "auxiliary_loss_mlp": 0.01260411, + "balance_loss_clip": 1.12935162, + "balance_loss_mlp": 1.03095746, + "epoch": 0.7532014669632658, + "flos": 27750870285120.0, + "grad_norm": 1.5307228066621854, + "language_loss": 0.689147, + "learning_rate": 6.05474969105289e-07, + "loss": 0.71658826, + "num_input_tokens_seen": 134683025, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.29296875, + "step": 6264, + "time_per_iteration": 3.0248053073883057 + }, + { + "auxiliary_loss_clip": 0.01487356, + "auxiliary_loss_mlp": 0.01262881, + "balance_loss_clip": 1.13277411, + "balance_loss_mlp": 1.03323674, + "epoch": 0.7533217098539049, + "flos": 14139190117440.0, + "grad_norm": 2.5022152641512734, + "language_loss": 0.74114347, + "learning_rate": 6.049166956582725e-07, + "loss": 0.76864588, + "num_input_tokens_seen": 134701290, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 2.29296875, + "step": 6265, + "time_per_iteration": 3.0436160564422607 + }, + { + "auxiliary_loss_clip": 0.01481714, + "auxiliary_loss_mlp": 0.01255614, + "balance_loss_clip": 1.12623572, + "balance_loss_mlp": 1.02520609, + "epoch": 0.753441952744544, + "flos": 26431298555040.0, + "grad_norm": 2.074650442478647, + "language_loss": 0.8800239, + "learning_rate": 6.043586338442841e-07, + "loss": 0.90739727, + "num_input_tokens_seen": 134720345, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.30078125, + "step": 6266, + "time_per_iteration": 2.966765880584717 + }, + { + "auxiliary_loss_clip": 0.01479959, + "auxiliary_loss_mlp": 0.01267803, + "balance_loss_clip": 1.12493956, + "balance_loss_mlp": 1.04159093, + "epoch": 0.7535621956351831, + "flos": 23880907717920.0, + "grad_norm": 1.7836675682779535, + "language_loss": 0.73338568, + "learning_rate": 6.038007837479815e-07, + "loss": 0.76086324, + "num_input_tokens_seen": 134741450, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 2.26367188, + "step": 6267, + "time_per_iteration": 2.991797685623169 + }, + { + "auxiliary_loss_clip": 0.01480581, + "auxiliary_loss_mlp": 0.01266868, + "balance_loss_clip": 1.12612987, + "balance_loss_mlp": 1.03874898, + "epoch": 0.7536824385258222, + "flos": 21797872404480.0, + "grad_norm": 2.1530483847957975, + "language_loss": 0.64393556, + "learning_rate": 6.032431454539897e-07, + "loss": 0.67141008, + "num_input_tokens_seen": 134760295, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 2.28125, + "step": 6268, + "time_per_iteration": 3.0001232624053955 + }, + { + "auxiliary_loss_clip": 0.01487675, + "auxiliary_loss_mlp": 0.01258213, + "balance_loss_clip": 1.13331318, + "balance_loss_mlp": 1.02799571, + "epoch": 0.7538026814164612, + "flos": 28914100682400.0, + "grad_norm": 2.42722930435046, + "language_loss": 0.81853491, + "learning_rate": 6.026857190469014e-07, + "loss": 0.84599376, + "num_input_tokens_seen": 134782050, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.30078125, + "step": 6269, + "time_per_iteration": 3.814525604248047 + }, + { + "auxiliary_loss_clip": 0.01479565, + "auxiliary_loss_mlp": 0.01264977, + "balance_loss_clip": 1.12518859, + "balance_loss_mlp": 1.03685796, + "epoch": 0.7539229243071004, + "flos": 21106776479040.0, + "grad_norm": 2.307941688156501, + "language_loss": 0.74492228, + "learning_rate": 6.0212850461128e-07, + "loss": 0.77236778, + "num_input_tokens_seen": 134801170, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 2.28320312, + "step": 6270, + "time_per_iteration": 3.027221918106079 + }, + { + "auxiliary_loss_clip": 0.01480305, + "auxiliary_loss_mlp": 0.01264482, + "balance_loss_clip": 1.12531722, + "balance_loss_mlp": 1.03350186, + "epoch": 0.7540431671977395, + "flos": 15160264313760.0, + "grad_norm": 2.5004882101402557, + "language_loss": 0.74741089, + "learning_rate": 6.015715022316516e-07, + "loss": 0.77485877, + "num_input_tokens_seen": 134819150, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 2.3046875, + "step": 6271, + "time_per_iteration": 4.149068593978882 + }, + { + "auxiliary_loss_clip": 0.01483062, + "auxiliary_loss_mlp": 0.01267388, + "balance_loss_clip": 1.1286999, + "balance_loss_mlp": 1.03488278, + "epoch": 0.7541634100883785, + "flos": 18772578339840.0, + "grad_norm": 2.537577239124258, + "language_loss": 0.78003931, + "learning_rate": 6.010147119925154e-07, + "loss": 0.80754381, + "num_input_tokens_seen": 134836905, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.3203125, + "step": 6272, + "time_per_iteration": 3.114717960357666 + }, + { + "auxiliary_loss_clip": 0.01481988, + "auxiliary_loss_mlp": 0.01257348, + "balance_loss_clip": 1.12789202, + "balance_loss_mlp": 1.02961016, + "epoch": 0.7542836529790176, + "flos": 20596865195520.0, + "grad_norm": 1.9313422627976546, + "language_loss": 0.66226137, + "learning_rate": 6.004581339783348e-07, + "loss": 0.68965471, + "num_input_tokens_seen": 134855225, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 2.27734375, + "step": 6273, + "time_per_iteration": 3.0557892322540283 + }, + { + "auxiliary_loss_clip": 0.01486867, + "auxiliary_loss_mlp": 0.01270212, + "balance_loss_clip": 1.13300228, + "balance_loss_mlp": 1.03541756, + "epoch": 0.7544038958696567, + "flos": 19096905314880.0, + "grad_norm": 3.7529591341901067, + "language_loss": 0.68257898, + "learning_rate": 5.999017682735425e-07, + "loss": 0.71014977, + "num_input_tokens_seen": 134871615, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 2.34375, + "step": 6274, + "time_per_iteration": 2.9630374908447266 + }, + { + "auxiliary_loss_clip": 0.01483766, + "auxiliary_loss_mlp": 0.01257882, + "balance_loss_clip": 1.12987661, + "balance_loss_mlp": 1.02556694, + "epoch": 0.7545241387602958, + "flos": 31726046661120.0, + "grad_norm": 2.06131010067789, + "language_loss": 0.66455615, + "learning_rate": 5.993456149625387e-07, + "loss": 0.69197261, + "num_input_tokens_seen": 134892765, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 2.32226562, + "step": 6275, + "time_per_iteration": 3.955824613571167 + }, + { + "auxiliary_loss_clip": 0.01479063, + "auxiliary_loss_mlp": 0.01253479, + "balance_loss_clip": 1.12396634, + "balance_loss_mlp": 1.02516937, + "epoch": 0.7546443816509348, + "flos": 20298215949120.0, + "grad_norm": 1.6820639583940868, + "language_loss": 0.82428396, + "learning_rate": 5.987896741296909e-07, + "loss": 0.85160935, + "num_input_tokens_seen": 134910505, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.28320312, + "step": 6276, + "time_per_iteration": 3.0540835857391357 + }, + { + "auxiliary_loss_clip": 0.01486862, + "auxiliary_loss_mlp": 0.01254982, + "balance_loss_clip": 1.13337684, + "balance_loss_mlp": 1.02648127, + "epoch": 0.754764624541574, + "flos": 23698433518560.0, + "grad_norm": 18.66939807933433, + "language_loss": 0.78233671, + "learning_rate": 5.982339458593361e-07, + "loss": 0.80975509, + "num_input_tokens_seen": 134930445, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 2.28125, + "step": 6277, + "time_per_iteration": 2.9893558025360107 + }, + { + "auxiliary_loss_clip": 0.01481257, + "auxiliary_loss_mlp": 0.01252965, + "balance_loss_clip": 1.12608755, + "balance_loss_mlp": 1.02389216, + "epoch": 0.7548848674322131, + "flos": 25339525539840.0, + "grad_norm": 1.5971802020251573, + "language_loss": 0.83984745, + "learning_rate": 5.976784302357767e-07, + "loss": 0.86718971, + "num_input_tokens_seen": 134951010, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.2890625, + "step": 6278, + "time_per_iteration": 3.0158987045288086 + }, + { + "auxiliary_loss_clip": 0.014834, + "auxiliary_loss_mlp": 0.01252215, + "balance_loss_clip": 1.12903452, + "balance_loss_mlp": 1.0214262, + "epoch": 0.7550051103228521, + "flos": 19575184148640.0, + "grad_norm": 2.0391671562207314, + "language_loss": 0.73939204, + "learning_rate": 5.971231273432855e-07, + "loss": 0.76674819, + "num_input_tokens_seen": 134970495, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 2.30273438, + "step": 6279, + "time_per_iteration": 2.9753549098968506 + }, + { + "auxiliary_loss_clip": 0.01441156, + "auxiliary_loss_mlp": 0.01192291, + "balance_loss_clip": 1.10374653, + "balance_loss_mlp": 1.00231934, + "epoch": 0.7551253532134913, + "flos": 64156160678400.0, + "grad_norm": 0.8261405055687364, + "language_loss": 0.54546642, + "learning_rate": 5.965680372661e-07, + "loss": 0.57180095, + "num_input_tokens_seen": 135028060, + "router_z_loss_clip": 3.375, + "router_z_loss_mlp": 1.8984375, + "step": 6280, + "time_per_iteration": 3.3209190368652344 + }, + { + "auxiliary_loss_clip": 0.01487152, + "auxiliary_loss_mlp": 0.01254155, + "balance_loss_clip": 1.13461053, + "balance_loss_mlp": 1.02489209, + "epoch": 0.7552455961041303, + "flos": 26070408469440.0, + "grad_norm": 2.0046828897370927, + "language_loss": 0.5653975, + "learning_rate": 5.960131600884266e-07, + "loss": 0.59281063, + "num_input_tokens_seen": 135047330, + "router_z_loss_clip": 3.5234375, + "router_z_loss_mlp": 2.29101562, + "step": 6281, + "time_per_iteration": 3.0308048725128174 + }, + { + "auxiliary_loss_clip": 0.01478736, + "auxiliary_loss_mlp": 0.01251279, + "balance_loss_clip": 1.12354541, + "balance_loss_mlp": 1.02182543, + "epoch": 0.7553658389947694, + "flos": 24500508333120.0, + "grad_norm": 1.8172096155683959, + "language_loss": 0.76123965, + "learning_rate": 5.954584958944413e-07, + "loss": 0.78853983, + "num_input_tokens_seen": 135065995, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.29296875, + "step": 6282, + "time_per_iteration": 2.919898509979248 + }, + { + "auxiliary_loss_clip": 0.01479818, + "auxiliary_loss_mlp": 0.01266235, + "balance_loss_clip": 1.12463629, + "balance_loss_mlp": 1.03697205, + "epoch": 0.7554860818854086, + "flos": 21801665220480.0, + "grad_norm": 3.6776234656583737, + "language_loss": 0.81877017, + "learning_rate": 5.949040447682854e-07, + "loss": 0.84623075, + "num_input_tokens_seen": 135085820, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.29101562, + "step": 6283, + "time_per_iteration": 3.0031166076660156 + }, + { + "auxiliary_loss_clip": 0.01481258, + "auxiliary_loss_mlp": 0.01263671, + "balance_loss_clip": 1.12498069, + "balance_loss_mlp": 1.03192806, + "epoch": 0.7556063247760476, + "flos": 16363964422080.0, + "grad_norm": 4.677895195324493, + "language_loss": 0.68668616, + "learning_rate": 5.943498067940686e-07, + "loss": 0.71413541, + "num_input_tokens_seen": 135102845, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.3125, + "step": 6284, + "time_per_iteration": 3.760735273361206 + }, + { + "auxiliary_loss_clip": 0.01485335, + "auxiliary_loss_mlp": 0.01254638, + "balance_loss_clip": 1.13159645, + "balance_loss_mlp": 1.02556539, + "epoch": 0.7557265676666867, + "flos": 27237621323520.0, + "grad_norm": 1.936714405055242, + "language_loss": 0.81697935, + "learning_rate": 5.937957820558686e-07, + "loss": 0.84437913, + "num_input_tokens_seen": 135122190, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 2.28710938, + "step": 6285, + "time_per_iteration": 2.97971248626709 + }, + { + "auxiliary_loss_clip": 0.01440976, + "auxiliary_loss_mlp": 0.01196167, + "balance_loss_clip": 1.10334706, + "balance_loss_mlp": 1.00619507, + "epoch": 0.7558468105573258, + "flos": 62195634414720.0, + "grad_norm": 0.8595969935371215, + "language_loss": 0.65225589, + "learning_rate": 5.932419706377296e-07, + "loss": 0.67862737, + "num_input_tokens_seen": 135180495, + "router_z_loss_clip": 3.375, + "router_z_loss_mlp": 1.8984375, + "step": 6286, + "time_per_iteration": 3.4436187744140625 + }, + { + "auxiliary_loss_clip": 0.01482365, + "auxiliary_loss_mlp": 0.01255555, + "balance_loss_clip": 1.12687063, + "balance_loss_mlp": 1.02419329, + "epoch": 0.7559670534479649, + "flos": 33251532557760.0, + "grad_norm": 2.6674626485997623, + "language_loss": 0.7416265, + "learning_rate": 5.92688372623666e-07, + "loss": 0.76900578, + "num_input_tokens_seen": 135199200, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.30859375, + "step": 6287, + "time_per_iteration": 3.092400312423706 + }, + { + "auxiliary_loss_clip": 0.01479359, + "auxiliary_loss_mlp": 0.01265204, + "balance_loss_clip": 1.12475085, + "balance_loss_mlp": 1.03288925, + "epoch": 0.7560872963386039, + "flos": 14066139752640.0, + "grad_norm": 2.5049287980767447, + "language_loss": 0.73716676, + "learning_rate": 5.921349880976574e-07, + "loss": 0.76461238, + "num_input_tokens_seen": 135217035, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.31835938, + "step": 6288, + "time_per_iteration": 2.98252534866333 + }, + { + "auxiliary_loss_clip": 0.01476639, + "auxiliary_loss_mlp": 0.01261016, + "balance_loss_clip": 1.12200499, + "balance_loss_mlp": 1.03232455, + "epoch": 0.7562075392292431, + "flos": 20414390996160.0, + "grad_norm": 1.6577090702865425, + "language_loss": 0.82164276, + "learning_rate": 5.915818171436515e-07, + "loss": 0.84901935, + "num_input_tokens_seen": 135236370, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 2.28710938, + "step": 6289, + "time_per_iteration": 3.0307092666625977 + }, + { + "auxiliary_loss_clip": 0.01480111, + "auxiliary_loss_mlp": 0.01265132, + "balance_loss_clip": 1.12609982, + "balance_loss_mlp": 1.03605962, + "epoch": 0.7563277821198822, + "flos": 20378927802240.0, + "grad_norm": 1.9541471278583415, + "language_loss": 0.74995291, + "learning_rate": 5.910288598455642e-07, + "loss": 0.77740532, + "num_input_tokens_seen": 135255720, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 2.29296875, + "step": 6290, + "time_per_iteration": 3.1650278568267822 + }, + { + "auxiliary_loss_clip": 0.01487009, + "auxiliary_loss_mlp": 0.01260391, + "balance_loss_clip": 1.13212073, + "balance_loss_mlp": 1.02654958, + "epoch": 0.7564480250105212, + "flos": 18590255853120.0, + "grad_norm": 2.5995627385029585, + "language_loss": 0.74413496, + "learning_rate": 5.90476116287278e-07, + "loss": 0.77160895, + "num_input_tokens_seen": 135273320, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.3359375, + "step": 6291, + "time_per_iteration": 2.9922029972076416 + }, + { + "auxiliary_loss_clip": 0.01481473, + "auxiliary_loss_mlp": 0.01264825, + "balance_loss_clip": 1.12657225, + "balance_loss_mlp": 1.03346407, + "epoch": 0.7565682679011604, + "flos": 21217679511840.0, + "grad_norm": 1.7740456130275377, + "language_loss": 0.68079746, + "learning_rate": 5.899235865526456e-07, + "loss": 0.70826042, + "num_input_tokens_seen": 135292615, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.31054688, + "step": 6292, + "time_per_iteration": 2.9599802494049072 + }, + { + "auxiliary_loss_clip": 0.014848, + "auxiliary_loss_mlp": 0.01255671, + "balance_loss_clip": 1.13094115, + "balance_loss_mlp": 1.02602661, + "epoch": 0.7566885107917994, + "flos": 20451105819360.0, + "grad_norm": 1.7997121098668998, + "language_loss": 0.82397699, + "learning_rate": 5.893712707254825e-07, + "loss": 0.85138178, + "num_input_tokens_seen": 135310075, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 2.29296875, + "step": 6293, + "time_per_iteration": 2.9325177669525146 + }, + { + "auxiliary_loss_clip": 0.01483034, + "auxiliary_loss_mlp": 0.01267358, + "balance_loss_clip": 1.12839603, + "balance_loss_mlp": 1.03256345, + "epoch": 0.7568087536824385, + "flos": 19027837406880.0, + "grad_norm": 2.8115520570062933, + "language_loss": 0.66399789, + "learning_rate": 5.888191688895769e-07, + "loss": 0.69150192, + "num_input_tokens_seen": 135327335, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.34375, + "step": 6294, + "time_per_iteration": 3.0969839096069336 + }, + { + "auxiliary_loss_clip": 0.01478762, + "auxiliary_loss_mlp": 0.01262803, + "balance_loss_clip": 1.12288737, + "balance_loss_mlp": 1.03048825, + "epoch": 0.7569289965730777, + "flos": 15226904819520.0, + "grad_norm": 2.4059154558338682, + "language_loss": 0.618563, + "learning_rate": 5.882672811286813e-07, + "loss": 0.64597863, + "num_input_tokens_seen": 135343615, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.32226562, + "step": 6295, + "time_per_iteration": 3.000464677810669 + }, + { + "auxiliary_loss_clip": 0.01480868, + "auxiliary_loss_mlp": 0.01269048, + "balance_loss_clip": 1.12408829, + "balance_loss_mlp": 1.03539813, + "epoch": 0.7570492394637167, + "flos": 20771033127840.0, + "grad_norm": 3.2351747689593786, + "language_loss": 0.69459528, + "learning_rate": 5.877156075265166e-07, + "loss": 0.72209442, + "num_input_tokens_seen": 135359880, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.33007812, + "step": 6296, + "time_per_iteration": 2.9832189083099365 + }, + { + "auxiliary_loss_clip": 0.01477622, + "auxiliary_loss_mlp": 0.01259244, + "balance_loss_clip": 1.1222266, + "balance_loss_mlp": 1.02902722, + "epoch": 0.7571694823543558, + "flos": 15665624218080.0, + "grad_norm": 6.668500534763926, + "language_loss": 0.69353509, + "learning_rate": 5.871641481667715e-07, + "loss": 0.72090375, + "num_input_tokens_seen": 135374325, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.296875, + "step": 6297, + "time_per_iteration": 3.8596651554107666 + }, + { + "auxiliary_loss_clip": 0.01480124, + "auxiliary_loss_mlp": 0.01267868, + "balance_loss_clip": 1.12489808, + "balance_loss_mlp": 1.03612518, + "epoch": 0.7572897252449949, + "flos": 25411513916160.0, + "grad_norm": 1.6548141928134346, + "language_loss": 0.84453195, + "learning_rate": 5.866129031331011e-07, + "loss": 0.87201184, + "num_input_tokens_seen": 135393980, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.31054688, + "step": 6298, + "time_per_iteration": 3.072293758392334 + }, + { + "auxiliary_loss_clip": 0.01478711, + "auxiliary_loss_mlp": 0.01262267, + "balance_loss_clip": 1.12315989, + "balance_loss_mlp": 1.03262258, + "epoch": 0.757409968135634, + "flos": 24281622735840.0, + "grad_norm": 2.1414160879764457, + "language_loss": 0.83561796, + "learning_rate": 5.8606187250913e-07, + "loss": 0.86302769, + "num_input_tokens_seen": 135412030, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.29296875, + "step": 6299, + "time_per_iteration": 3.9628210067749023 + }, + { + "auxiliary_loss_clip": 0.01485899, + "auxiliary_loss_mlp": 0.01266045, + "balance_loss_clip": 1.13180828, + "balance_loss_mlp": 1.03525555, + "epoch": 0.757530211026273, + "flos": 24136053000480.0, + "grad_norm": 2.4860314950743456, + "language_loss": 0.83915281, + "learning_rate": 5.855110563784482e-07, + "loss": 0.86667228, + "num_input_tokens_seen": 135430565, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 2.30273438, + "step": 6300, + "time_per_iteration": 3.0455262660980225 + }, + { + "auxiliary_loss_clip": 0.01476613, + "auxiliary_loss_mlp": 0.01252319, + "balance_loss_clip": 1.12287545, + "balance_loss_mlp": 1.02458191, + "epoch": 0.7576504539169122, + "flos": 23954109795360.0, + "grad_norm": 2.05611699673202, + "language_loss": 0.64062595, + "learning_rate": 5.849604548246156e-07, + "loss": 0.66791534, + "num_input_tokens_seen": 135451675, + "router_z_loss_clip": 3.54101562, + "router_z_loss_mlp": 2.27539062, + "step": 6301, + "time_per_iteration": 3.0107784271240234 + }, + { + "auxiliary_loss_clip": 0.01485227, + "auxiliary_loss_mlp": 0.01257158, + "balance_loss_clip": 1.13054705, + "balance_loss_mlp": 1.02617836, + "epoch": 0.7577706968075513, + "flos": 21253484059200.0, + "grad_norm": 2.209871669717351, + "language_loss": 0.80656981, + "learning_rate": 5.844100679311565e-07, + "loss": 0.83399367, + "num_input_tokens_seen": 135470635, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.30859375, + "step": 6302, + "time_per_iteration": 2.963498592376709 + }, + { + "auxiliary_loss_clip": 0.0148269, + "auxiliary_loss_mlp": 0.01262691, + "balance_loss_clip": 1.12859285, + "balance_loss_mlp": 1.02942204, + "epoch": 0.7578909396981903, + "flos": 18298585388160.0, + "grad_norm": 2.07412939497997, + "language_loss": 0.76114303, + "learning_rate": 5.838598957815637e-07, + "loss": 0.78859687, + "num_input_tokens_seen": 135487865, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 2.32617188, + "step": 6303, + "time_per_iteration": 3.9162802696228027 + }, + { + "auxiliary_loss_clip": 0.01477604, + "auxiliary_loss_mlp": 0.01255947, + "balance_loss_clip": 1.12159348, + "balance_loss_mlp": 1.02515757, + "epoch": 0.7580111825888295, + "flos": 25376202434880.0, + "grad_norm": 1.6705608058960497, + "language_loss": 0.85606015, + "learning_rate": 5.833099384592996e-07, + "loss": 0.88339561, + "num_input_tokens_seen": 135508440, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.30273438, + "step": 6304, + "time_per_iteration": 3.022216320037842 + }, + { + "auxiliary_loss_clip": 0.01478569, + "auxiliary_loss_mlp": 0.01261824, + "balance_loss_clip": 1.12229705, + "balance_loss_mlp": 1.03179777, + "epoch": 0.7581314254794685, + "flos": 23770914960960.0, + "grad_norm": 2.404095657893212, + "language_loss": 0.71686858, + "learning_rate": 5.827601960477913e-07, + "loss": 0.74427247, + "num_input_tokens_seen": 135526365, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.29882812, + "step": 6305, + "time_per_iteration": 2.9716007709503174 + }, + { + "auxiliary_loss_clip": 0.01482575, + "auxiliary_loss_mlp": 0.0126221, + "balance_loss_clip": 1.12794209, + "balance_loss_mlp": 1.03313756, + "epoch": 0.7582516683701076, + "flos": 22056810503040.0, + "grad_norm": 2.2504431325244383, + "language_loss": 0.7091167, + "learning_rate": 5.822106686304344e-07, + "loss": 0.73656452, + "num_input_tokens_seen": 135545655, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.28710938, + "step": 6306, + "time_per_iteration": 2.987217664718628 + }, + { + "auxiliary_loss_clip": 0.01481436, + "auxiliary_loss_mlp": 0.01264993, + "balance_loss_clip": 1.12611592, + "balance_loss_mlp": 1.03401256, + "epoch": 0.7583719112607467, + "flos": 31652427373920.0, + "grad_norm": 1.7401392022562294, + "language_loss": 0.57945961, + "learning_rate": 5.816613562905919e-07, + "loss": 0.60692388, + "num_input_tokens_seen": 135566840, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.30664062, + "step": 6307, + "time_per_iteration": 3.0049304962158203 + }, + { + "auxiliary_loss_clip": 0.01484106, + "auxiliary_loss_mlp": 0.01253868, + "balance_loss_clip": 1.12994719, + "balance_loss_mlp": 1.02574968, + "epoch": 0.7584921541513858, + "flos": 33070916838240.0, + "grad_norm": 1.740153723285525, + "language_loss": 0.70093459, + "learning_rate": 5.811122591115933e-07, + "loss": 0.72831434, + "num_input_tokens_seen": 135587825, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 2.27929688, + "step": 6308, + "time_per_iteration": 3.0387754440307617 + }, + { + "auxiliary_loss_clip": 0.01484123, + "auxiliary_loss_mlp": 0.01261848, + "balance_loss_clip": 1.12846589, + "balance_loss_mlp": 1.02877045, + "epoch": 0.7586123970420249, + "flos": 23328326890080.0, + "grad_norm": 2.59147171417365, + "language_loss": 0.71492195, + "learning_rate": 5.805633771767376e-07, + "loss": 0.74238157, + "num_input_tokens_seen": 135605220, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.328125, + "step": 6309, + "time_per_iteration": 2.979515790939331 + }, + { + "auxiliary_loss_clip": 0.01484451, + "auxiliary_loss_mlp": 0.01268804, + "balance_loss_clip": 1.12954783, + "balance_loss_mlp": 1.03877735, + "epoch": 0.7587326399326639, + "flos": 18336286343520.0, + "grad_norm": 4.004465243818702, + "language_loss": 0.77711493, + "learning_rate": 5.800147105692888e-07, + "loss": 0.80464751, + "num_input_tokens_seen": 135624795, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 2.296875, + "step": 6310, + "time_per_iteration": 3.0472304821014404 + }, + { + "auxiliary_loss_clip": 0.01478953, + "auxiliary_loss_mlp": 0.01252749, + "balance_loss_clip": 1.1235224, + "balance_loss_mlp": 1.02462995, + "epoch": 0.7588528828233031, + "flos": 17277018125760.0, + "grad_norm": 2.5132757147552915, + "language_loss": 0.79124427, + "learning_rate": 5.794662593724795e-07, + "loss": 0.81856126, + "num_input_tokens_seen": 135643800, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.27929688, + "step": 6311, + "time_per_iteration": 3.9037230014801025 + }, + { + "auxiliary_loss_clip": 0.01484537, + "auxiliary_loss_mlp": 0.01264658, + "balance_loss_clip": 1.12912023, + "balance_loss_mlp": 1.03272438, + "epoch": 0.7589731257139422, + "flos": 17715623739840.0, + "grad_norm": 2.6190572086651316, + "language_loss": 0.75259686, + "learning_rate": 5.789180236695091e-07, + "loss": 0.78008878, + "num_input_tokens_seen": 135660655, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.31640625, + "step": 6312, + "time_per_iteration": 3.1525750160217285 + }, + { + "auxiliary_loss_clip": 0.01478305, + "auxiliary_loss_mlp": 0.01253099, + "balance_loss_clip": 1.12261796, + "balance_loss_mlp": 1.02459836, + "epoch": 0.7590933686045812, + "flos": 15962908050720.0, + "grad_norm": 2.1181117357779464, + "language_loss": 0.85603327, + "learning_rate": 5.78370003543544e-07, + "loss": 0.88334733, + "num_input_tokens_seen": 135679410, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.28320312, + "step": 6313, + "time_per_iteration": 3.1183888912200928 + }, + { + "auxiliary_loss_clip": 0.01480469, + "auxiliary_loss_mlp": 0.01265292, + "balance_loss_clip": 1.1249373, + "balance_loss_mlp": 1.03240514, + "epoch": 0.7592136114952204, + "flos": 21070137512160.0, + "grad_norm": 2.0657328757387816, + "language_loss": 0.8409974, + "learning_rate": 5.778221990777203e-07, + "loss": 0.86845499, + "num_input_tokens_seen": 135697150, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.32421875, + "step": 6314, + "time_per_iteration": 2.9990861415863037 + }, + { + "auxiliary_loss_clip": 0.0148356, + "auxiliary_loss_mlp": 0.01256904, + "balance_loss_clip": 1.12930501, + "balance_loss_mlp": 1.02516103, + "epoch": 0.7593338543858594, + "flos": 25299814392000.0, + "grad_norm": 2.4627624468450615, + "language_loss": 0.82905936, + "learning_rate": 5.772746103551372e-07, + "loss": 0.85646403, + "num_input_tokens_seen": 135712545, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 2.3125, + "step": 6315, + "time_per_iteration": 3.079373836517334 + }, + { + "auxiliary_loss_clip": 0.01479715, + "auxiliary_loss_mlp": 0.01254212, + "balance_loss_clip": 1.12423587, + "balance_loss_mlp": 1.02571177, + "epoch": 0.7594540972764985, + "flos": 31835091214080.0, + "grad_norm": 4.236267270487764, + "language_loss": 0.71784186, + "learning_rate": 5.767272374588648e-07, + "loss": 0.7451812, + "num_input_tokens_seen": 135733950, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.28125, + "step": 6316, + "time_per_iteration": 3.0437073707580566 + }, + { + "auxiliary_loss_clip": 0.01484787, + "auxiliary_loss_mlp": 0.01268412, + "balance_loss_clip": 1.13017213, + "balance_loss_mlp": 1.03838563, + "epoch": 0.7595743401671377, + "flos": 37600304952960.0, + "grad_norm": 2.150345468935072, + "language_loss": 0.78077585, + "learning_rate": 5.76180080471939e-07, + "loss": 0.80830789, + "num_input_tokens_seen": 135757120, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 2.29492188, + "step": 6317, + "time_per_iteration": 3.096306562423706 + }, + { + "auxiliary_loss_clip": 0.01487605, + "auxiliary_loss_mlp": 0.01266933, + "balance_loss_clip": 1.13322318, + "balance_loss_mlp": 1.03366399, + "epoch": 0.7596945830577767, + "flos": 18289217132640.0, + "grad_norm": 1.9861051511504804, + "language_loss": 0.72367984, + "learning_rate": 5.756331394773631e-07, + "loss": 0.75122523, + "num_input_tokens_seen": 135773335, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 2.328125, + "step": 6318, + "time_per_iteration": 2.9480221271514893 + }, + { + "auxiliary_loss_clip": 0.01473417, + "auxiliary_loss_mlp": 0.01267617, + "balance_loss_clip": 1.11783075, + "balance_loss_mlp": 1.03778195, + "epoch": 0.7598148259484158, + "flos": 22235112604800.0, + "grad_norm": 1.9960698242964023, + "language_loss": 0.75922513, + "learning_rate": 5.750864145581071e-07, + "loss": 0.78663552, + "num_input_tokens_seen": 135792555, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.29296875, + "step": 6319, + "time_per_iteration": 2.9719300270080566 + }, + { + "auxiliary_loss_clip": 0.01479088, + "auxiliary_loss_mlp": 0.01256963, + "balance_loss_clip": 1.12403011, + "balance_loss_mlp": 1.02769971, + "epoch": 0.7599350688390549, + "flos": 27164153748960.0, + "grad_norm": 2.058583338699725, + "language_loss": 0.86278445, + "learning_rate": 5.745399057971085e-07, + "loss": 0.89014494, + "num_input_tokens_seen": 135813690, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.29101562, + "step": 6320, + "time_per_iteration": 3.0934274196624756 + }, + { + "auxiliary_loss_clip": 0.01483009, + "auxiliary_loss_mlp": 0.01261258, + "balance_loss_clip": 1.12744224, + "balance_loss_mlp": 1.03085029, + "epoch": 0.760055311729694, + "flos": 15562572314400.0, + "grad_norm": 4.006446212990472, + "language_loss": 0.75655937, + "learning_rate": 5.739936132772738e-07, + "loss": 0.78400201, + "num_input_tokens_seen": 135832255, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.30273438, + "step": 6321, + "time_per_iteration": 2.95725679397583 + }, + { + "auxiliary_loss_clip": 0.01477202, + "auxiliary_loss_mlp": 0.01265397, + "balance_loss_clip": 1.12103307, + "balance_loss_mlp": 1.03441656, + "epoch": 0.760175554620333, + "flos": 25157847831840.0, + "grad_norm": 2.1538326424556162, + "language_loss": 0.74623644, + "learning_rate": 5.734475370814733e-07, + "loss": 0.77366233, + "num_input_tokens_seen": 135851935, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.30664062, + "step": 6322, + "time_per_iteration": 3.0091121196746826 + }, + { + "auxiliary_loss_clip": 0.01478596, + "auxiliary_loss_mlp": 0.01263446, + "balance_loss_clip": 1.12298489, + "balance_loss_mlp": 1.03342021, + "epoch": 0.7602957975109722, + "flos": 24356152298880.0, + "grad_norm": 1.7686127376058471, + "language_loss": 0.78509301, + "learning_rate": 5.729016772925483e-07, + "loss": 0.81251341, + "num_input_tokens_seen": 135873510, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.296875, + "step": 6323, + "time_per_iteration": 3.04779052734375 + }, + { + "auxiliary_loss_clip": 0.01484165, + "auxiliary_loss_mlp": 0.01261106, + "balance_loss_clip": 1.12926698, + "balance_loss_mlp": 1.02936292, + "epoch": 0.7604160404016113, + "flos": 25195055721120.0, + "grad_norm": 3.3736000704030142, + "language_loss": 0.71056122, + "learning_rate": 5.723560339933038e-07, + "loss": 0.73801392, + "num_input_tokens_seen": 135893845, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 2.31445312, + "step": 6324, + "time_per_iteration": 3.925180673599243 + }, + { + "auxiliary_loss_clip": 0.01482047, + "auxiliary_loss_mlp": 0.01257685, + "balance_loss_clip": 1.12728322, + "balance_loss_mlp": 1.02651405, + "epoch": 0.7605362832922503, + "flos": 29864438131680.0, + "grad_norm": 2.2815479115372366, + "language_loss": 0.65385067, + "learning_rate": 5.71810607266513e-07, + "loss": 0.68124795, + "num_input_tokens_seen": 135912430, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.30664062, + "step": 6325, + "time_per_iteration": 3.113229274749756 + }, + { + "auxiliary_loss_clip": 0.01478155, + "auxiliary_loss_mlp": 0.01261208, + "balance_loss_clip": 1.12129688, + "balance_loss_mlp": 1.03442466, + "epoch": 0.7606565261828895, + "flos": 13919659741440.0, + "grad_norm": 2.14053307292297, + "language_loss": 0.60739374, + "learning_rate": 5.712653971949184e-07, + "loss": 0.63478738, + "num_input_tokens_seen": 135930550, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.265625, + "step": 6326, + "time_per_iteration": 3.8881993293762207 + }, + { + "auxiliary_loss_clip": 0.0148153, + "auxiliary_loss_mlp": 0.01269819, + "balance_loss_clip": 1.1261766, + "balance_loss_mlp": 1.03941154, + "epoch": 0.7607767690735285, + "flos": 18553237604640.0, + "grad_norm": 2.631436214337121, + "language_loss": 0.75517035, + "learning_rate": 5.707204038612268e-07, + "loss": 0.78268385, + "num_input_tokens_seen": 135947980, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.30273438, + "step": 6327, + "time_per_iteration": 3.134650230407715 + }, + { + "auxiliary_loss_clip": 0.0148977, + "auxiliary_loss_mlp": 0.01279944, + "balance_loss_clip": 1.13524544, + "balance_loss_mlp": 1.04209781, + "epoch": 0.7608970119641676, + "flos": 20925098771040.0, + "grad_norm": 54.69194081582838, + "language_loss": 0.73538405, + "learning_rate": 5.701756273481138e-07, + "loss": 0.76308119, + "num_input_tokens_seen": 135965400, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 2.37304688, + "step": 6328, + "time_per_iteration": 3.0619237422943115 + }, + { + "auxiliary_loss_clip": 0.0148113, + "auxiliary_loss_mlp": 0.01253743, + "balance_loss_clip": 1.12651539, + "balance_loss_mlp": 1.02333522, + "epoch": 0.7610172548548068, + "flos": 23809677904800.0, + "grad_norm": 1.7520718088699951, + "language_loss": 0.741606, + "learning_rate": 5.696310677382212e-07, + "loss": 0.76895475, + "num_input_tokens_seen": 135986795, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.29882812, + "step": 6329, + "time_per_iteration": 3.1037893295288086 + }, + { + "auxiliary_loss_clip": 0.01437594, + "auxiliary_loss_mlp": 0.01189629, + "balance_loss_clip": 1.0993154, + "balance_loss_mlp": 1.00080109, + "epoch": 0.7611374977454458, + "flos": 66503444032800.0, + "grad_norm": 0.8816966050650669, + "language_loss": 0.61714721, + "learning_rate": 5.690867251141576e-07, + "loss": 0.64341938, + "num_input_tokens_seen": 136053450, + "router_z_loss_clip": 3.375, + "router_z_loss_mlp": 1.88671875, + "step": 6330, + "time_per_iteration": 4.362483501434326 + }, + { + "auxiliary_loss_clip": 0.01475534, + "auxiliary_loss_mlp": 0.01263468, + "balance_loss_clip": 1.11879277, + "balance_loss_mlp": 1.02981758, + "epoch": 0.7612577406360849, + "flos": 15634977900480.0, + "grad_norm": 2.5575968486792098, + "language_loss": 0.91241801, + "learning_rate": 5.685425995585013e-07, + "loss": 0.93980807, + "num_input_tokens_seen": 136071375, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.33203125, + "step": 6331, + "time_per_iteration": 2.9849464893341064 + }, + { + "auxiliary_loss_clip": 0.01438149, + "auxiliary_loss_mlp": 0.01189423, + "balance_loss_clip": 1.09946728, + "balance_loss_mlp": 1.00021362, + "epoch": 0.761377983526724, + "flos": 60533529828480.0, + "grad_norm": 0.7530791872677984, + "language_loss": 0.58959699, + "learning_rate": 5.679986911537935e-07, + "loss": 0.61587274, + "num_input_tokens_seen": 136138905, + "router_z_loss_clip": 3.390625, + "router_z_loss_mlp": 1.890625, + "step": 6332, + "time_per_iteration": 3.476595640182495 + }, + { + "auxiliary_loss_clip": 0.01477466, + "auxiliary_loss_mlp": 0.01257474, + "balance_loss_clip": 1.12069333, + "balance_loss_mlp": 1.02859235, + "epoch": 0.7614982264173631, + "flos": 35775449174880.0, + "grad_norm": 1.9269576785629157, + "language_loss": 0.67203927, + "learning_rate": 5.674549999825462e-07, + "loss": 0.69938868, + "num_input_tokens_seen": 136161720, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.2890625, + "step": 6333, + "time_per_iteration": 3.0915629863739014 + }, + { + "auxiliary_loss_clip": 0.01437615, + "auxiliary_loss_mlp": 0.01189461, + "balance_loss_clip": 1.09886312, + "balance_loss_mlp": 1.00101471, + "epoch": 0.7616184693080021, + "flos": 67932780950880.0, + "grad_norm": 0.9129868971405992, + "language_loss": 0.71421021, + "learning_rate": 5.669115261272363e-07, + "loss": 0.74048096, + "num_input_tokens_seen": 136222040, + "router_z_loss_clip": 3.390625, + "router_z_loss_mlp": 1.8828125, + "step": 6334, + "time_per_iteration": 3.3687968254089355 + }, + { + "auxiliary_loss_clip": 0.0148172, + "auxiliary_loss_mlp": 0.01262425, + "balance_loss_clip": 1.1262995, + "balance_loss_mlp": 1.03011024, + "epoch": 0.7617387121986413, + "flos": 20524080327840.0, + "grad_norm": 2.96567050075527, + "language_loss": 0.72921979, + "learning_rate": 5.663682696703081e-07, + "loss": 0.7566613, + "num_input_tokens_seen": 136240305, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.31835938, + "step": 6335, + "time_per_iteration": 2.947768449783325 + }, + { + "auxiliary_loss_clip": 0.01479576, + "auxiliary_loss_mlp": 0.01258326, + "balance_loss_clip": 1.12455142, + "balance_loss_mlp": 1.03001595, + "epoch": 0.7618589550892804, + "flos": 18626477610240.0, + "grad_norm": 2.193441039701984, + "language_loss": 0.82436585, + "learning_rate": 5.658252306941746e-07, + "loss": 0.85174489, + "num_input_tokens_seen": 136259625, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.28320312, + "step": 6336, + "time_per_iteration": 3.0791423320770264 + }, + { + "auxiliary_loss_clip": 0.01479165, + "auxiliary_loss_mlp": 0.01261981, + "balance_loss_clip": 1.12264442, + "balance_loss_mlp": 1.03023839, + "epoch": 0.7619791979799194, + "flos": 17455471940160.0, + "grad_norm": 4.143824021453854, + "language_loss": 0.75518048, + "learning_rate": 5.65282409281212e-07, + "loss": 0.78259194, + "num_input_tokens_seen": 136277090, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.31445312, + "step": 6337, + "time_per_iteration": 3.057921886444092 + }, + { + "auxiliary_loss_clip": 0.0148322, + "auxiliary_loss_mlp": 0.01260797, + "balance_loss_clip": 1.12858868, + "balance_loss_mlp": 1.0300076, + "epoch": 0.7620994408705585, + "flos": 14139190117440.0, + "grad_norm": 2.3739341744916675, + "language_loss": 0.69905454, + "learning_rate": 5.64739805513768e-07, + "loss": 0.72649467, + "num_input_tokens_seen": 136294635, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 2.3046875, + "step": 6338, + "time_per_iteration": 3.012341260910034 + }, + { + "auxiliary_loss_clip": 0.01436697, + "auxiliary_loss_mlp": 0.01195038, + "balance_loss_clip": 1.09789503, + "balance_loss_mlp": 1.00621033, + "epoch": 0.7622196837611976, + "flos": 70714763683200.0, + "grad_norm": 0.7952659217612453, + "language_loss": 0.55675983, + "learning_rate": 5.641974194741541e-07, + "loss": 0.58307719, + "num_input_tokens_seen": 136350320, + "router_z_loss_clip": 3.390625, + "router_z_loss_mlp": 1.88671875, + "step": 6339, + "time_per_iteration": 4.429515361785889 + }, + { + "auxiliary_loss_clip": 0.01434869, + "auxiliary_loss_mlp": 0.01188766, + "balance_loss_clip": 1.09637117, + "balance_loss_mlp": 1.00260925, + "epoch": 0.7623399266518367, + "flos": 60690061166400.0, + "grad_norm": 0.7594883742422955, + "language_loss": 0.63636643, + "learning_rate": 5.636552512446502e-07, + "loss": 0.66260284, + "num_input_tokens_seen": 136411375, + "router_z_loss_clip": 3.375, + "router_z_loss_mlp": 1.859375, + "step": 6340, + "time_per_iteration": 3.333967447280884 + }, + { + "auxiliary_loss_clip": 0.01479737, + "auxiliary_loss_mlp": 0.01262872, + "balance_loss_clip": 1.12420547, + "balance_loss_mlp": 1.03418124, + "epoch": 0.7624601695424758, + "flos": 26471313128160.0, + "grad_norm": 1.7121730867004645, + "language_loss": 0.78256935, + "learning_rate": 5.631133009075027e-07, + "loss": 0.80999541, + "num_input_tokens_seen": 136430560, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.28710938, + "step": 6341, + "time_per_iteration": 3.040994167327881 + }, + { + "auxiliary_loss_clip": 0.0148351, + "auxiliary_loss_mlp": 0.01252735, + "balance_loss_clip": 1.12805629, + "balance_loss_mlp": 1.02404368, + "epoch": 0.7625804124331149, + "flos": 19137792235680.0, + "grad_norm": 1.9413939055088425, + "language_loss": 0.68542731, + "learning_rate": 5.625715685449242e-07, + "loss": 0.71278977, + "num_input_tokens_seen": 136448665, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.28515625, + "step": 6342, + "time_per_iteration": 3.073399305343628 + }, + { + "auxiliary_loss_clip": 0.01487556, + "auxiliary_loss_mlp": 0.01255087, + "balance_loss_clip": 1.13314795, + "balance_loss_mlp": 1.02525187, + "epoch": 0.762700655323754, + "flos": 26215485138720.0, + "grad_norm": 1.774389323898034, + "language_loss": 0.71856463, + "learning_rate": 5.620300542390966e-07, + "loss": 0.74599111, + "num_input_tokens_seen": 136469710, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.296875, + "step": 6343, + "time_per_iteration": 3.1525118350982666 + }, + { + "auxiliary_loss_clip": 0.0148038, + "auxiliary_loss_mlp": 0.01252268, + "balance_loss_clip": 1.12513602, + "balance_loss_mlp": 1.02777338, + "epoch": 0.762820898214393, + "flos": 22384778581440.0, + "grad_norm": 1.9495350499126578, + "language_loss": 0.85511112, + "learning_rate": 5.614887580721659e-07, + "loss": 0.88243759, + "num_input_tokens_seen": 136489855, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.24609375, + "step": 6344, + "time_per_iteration": 3.0269787311553955 + }, + { + "auxiliary_loss_clip": 0.01481852, + "auxiliary_loss_mlp": 0.01261537, + "balance_loss_clip": 1.12567782, + "balance_loss_mlp": 1.02922213, + "epoch": 0.7629411411050322, + "flos": 15702035616000.0, + "grad_norm": 2.549971441557994, + "language_loss": 0.74152327, + "learning_rate": 5.609476801262481e-07, + "loss": 0.76895714, + "num_input_tokens_seen": 136504715, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.31835938, + "step": 6345, + "time_per_iteration": 3.0597915649414062 + }, + { + "auxiliary_loss_clip": 0.01485449, + "auxiliary_loss_mlp": 0.01262911, + "balance_loss_clip": 1.13033247, + "balance_loss_mlp": 1.03307533, + "epoch": 0.7630613839956712, + "flos": 13771966029120.0, + "grad_norm": 3.7161750150611987, + "language_loss": 0.64297998, + "learning_rate": 5.604068204834223e-07, + "loss": 0.67046356, + "num_input_tokens_seen": 136521610, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 2.296875, + "step": 6346, + "time_per_iteration": 3.0281593799591064 + }, + { + "auxiliary_loss_clip": 0.01476092, + "auxiliary_loss_mlp": 0.01261883, + "balance_loss_clip": 1.1200552, + "balance_loss_mlp": 1.03338242, + "epoch": 0.7631816268863103, + "flos": 14571044519040.0, + "grad_norm": 2.4447584294674543, + "language_loss": 0.77038825, + "learning_rate": 5.598661792257367e-07, + "loss": 0.797768, + "num_input_tokens_seen": 136538655, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.28515625, + "step": 6347, + "time_per_iteration": 3.1645166873931885 + }, + { + "auxiliary_loss_clip": 0.01480988, + "auxiliary_loss_mlp": 0.01254843, + "balance_loss_clip": 1.12625623, + "balance_loss_mlp": 1.02481651, + "epoch": 0.7633018697769495, + "flos": 19064210876640.0, + "grad_norm": 1.9596642743982855, + "language_loss": 0.76141584, + "learning_rate": 5.593257564352071e-07, + "loss": 0.78877413, + "num_input_tokens_seen": 136557095, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 2.29882812, + "step": 6348, + "time_per_iteration": 3.097785472869873 + }, + { + "auxiliary_loss_clip": 0.01479006, + "auxiliary_loss_mlp": 0.01257721, + "balance_loss_clip": 1.12355804, + "balance_loss_mlp": 1.02845764, + "epoch": 0.7634221126675885, + "flos": 22055065807680.0, + "grad_norm": 1.5552077784275038, + "language_loss": 0.75793028, + "learning_rate": 5.58785552193815e-07, + "loss": 0.78529751, + "num_input_tokens_seen": 136577340, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.29101562, + "step": 6349, + "time_per_iteration": 3.008086919784546 + }, + { + "auxiliary_loss_clip": 0.01481254, + "auxiliary_loss_mlp": 0.01263733, + "balance_loss_clip": 1.1251241, + "balance_loss_mlp": 1.03389788, + "epoch": 0.7635423555582276, + "flos": 29385021453120.0, + "grad_norm": 3.621100882391898, + "language_loss": 0.75514466, + "learning_rate": 5.582455665835086e-07, + "loss": 0.78259456, + "num_input_tokens_seen": 136597635, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.296875, + "step": 6350, + "time_per_iteration": 3.0497777462005615 + }, + { + "auxiliary_loss_clip": 0.01482889, + "auxiliary_loss_mlp": 0.01271572, + "balance_loss_clip": 1.12521553, + "balance_loss_mlp": 1.03849411, + "epoch": 0.7636625984488667, + "flos": 17786815624800.0, + "grad_norm": 5.62458065836777, + "language_loss": 0.73022819, + "learning_rate": 5.577057996862036e-07, + "loss": 0.7577728, + "num_input_tokens_seen": 136615260, + "router_z_loss_clip": 3.58203125, + "router_z_loss_mlp": 2.32421875, + "step": 6351, + "time_per_iteration": 2.9656777381896973 + }, + { + "auxiliary_loss_clip": 0.01485887, + "auxiliary_loss_mlp": 0.01255239, + "balance_loss_clip": 1.13082504, + "balance_loss_mlp": 1.02635765, + "epoch": 0.7637828413395058, + "flos": 23736968893440.0, + "grad_norm": 1.5587455214063264, + "language_loss": 0.76095027, + "learning_rate": 5.571662515837814e-07, + "loss": 0.78836155, + "num_input_tokens_seen": 136637220, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.28515625, + "step": 6352, + "time_per_iteration": 3.806173086166382 + }, + { + "auxiliary_loss_clip": 0.0148438, + "auxiliary_loss_mlp": 0.01252105, + "balance_loss_clip": 1.12922454, + "balance_loss_mlp": 1.02360463, + "epoch": 0.7639030842301449, + "flos": 36286232806080.0, + "grad_norm": 2.501899015285635, + "language_loss": 0.83931601, + "learning_rate": 5.566269223580926e-07, + "loss": 0.8666808, + "num_input_tokens_seen": 136658930, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.28320312, + "step": 6353, + "time_per_iteration": 3.240842580795288 + }, + { + "auxiliary_loss_clip": 0.01480971, + "auxiliary_loss_mlp": 0.01256555, + "balance_loss_clip": 1.12463379, + "balance_loss_mlp": 1.02862704, + "epoch": 0.764023327120784, + "flos": 28880723537280.0, + "grad_norm": 1.6455944740155333, + "language_loss": 0.75354028, + "learning_rate": 5.560878120909511e-07, + "loss": 0.7809155, + "num_input_tokens_seen": 136681530, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.27734375, + "step": 6354, + "time_per_iteration": 3.9743123054504395 + }, + { + "auxiliary_loss_clip": 0.01437158, + "auxiliary_loss_mlp": 0.01190414, + "balance_loss_clip": 1.09841275, + "balance_loss_mlp": 1.00463867, + "epoch": 0.7641435700114231, + "flos": 64796925206880.0, + "grad_norm": 0.8510158480977537, + "language_loss": 0.58559895, + "learning_rate": 5.55548920864141e-07, + "loss": 0.6118747, + "num_input_tokens_seen": 136742185, + "router_z_loss_clip": 3.390625, + "router_z_loss_mlp": 1.85546875, + "step": 6355, + "time_per_iteration": 3.4723079204559326 + }, + { + "auxiliary_loss_clip": 0.01486743, + "auxiliary_loss_mlp": 0.01255745, + "balance_loss_clip": 1.13170099, + "balance_loss_mlp": 1.02724457, + "epoch": 0.7642638129020621, + "flos": 16837616020320.0, + "grad_norm": 1.6636061513101887, + "language_loss": 0.78159434, + "learning_rate": 5.550102487594113e-07, + "loss": 0.80901927, + "num_input_tokens_seen": 136760855, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.28710938, + "step": 6356, + "time_per_iteration": 3.0072758197784424 + }, + { + "auxiliary_loss_clip": 0.01490217, + "auxiliary_loss_mlp": 0.01255272, + "balance_loss_clip": 1.13474226, + "balance_loss_mlp": 1.02295685, + "epoch": 0.7643840557927013, + "flos": 30411557304480.0, + "grad_norm": 1.638901023578703, + "language_loss": 0.71712661, + "learning_rate": 5.54471795858477e-07, + "loss": 0.74458152, + "num_input_tokens_seen": 136780925, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.31835938, + "step": 6357, + "time_per_iteration": 3.8812201023101807 + }, + { + "auxiliary_loss_clip": 0.0147969, + "auxiliary_loss_mlp": 0.01255922, + "balance_loss_clip": 1.12394834, + "balance_loss_mlp": 1.02627754, + "epoch": 0.7645042986833404, + "flos": 16985158020000.0, + "grad_norm": 2.0545897773227746, + "language_loss": 0.82518184, + "learning_rate": 5.539335622430235e-07, + "loss": 0.85253793, + "num_input_tokens_seen": 136799545, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.29296875, + "step": 6358, + "time_per_iteration": 2.928760528564453 + }, + { + "auxiliary_loss_clip": 0.01477849, + "auxiliary_loss_mlp": 0.0125221, + "balance_loss_clip": 1.12165666, + "balance_loss_mlp": 1.02466321, + "epoch": 0.7646245415739794, + "flos": 17313429523680.0, + "grad_norm": 2.3165415461095953, + "language_loss": 0.74739814, + "learning_rate": 5.533955479946975e-07, + "loss": 0.77469873, + "num_input_tokens_seen": 136818325, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.2734375, + "step": 6359, + "time_per_iteration": 3.002223014831543 + }, + { + "auxiliary_loss_clip": 0.01437547, + "auxiliary_loss_mlp": 0.01190742, + "balance_loss_clip": 1.09844899, + "balance_loss_mlp": 1.00267792, + "epoch": 0.7647447844646186, + "flos": 70409514572640.0, + "grad_norm": 0.8580513229943673, + "language_loss": 0.65705615, + "learning_rate": 5.528577531951173e-07, + "loss": 0.68333906, + "num_input_tokens_seen": 136878730, + "router_z_loss_clip": 3.390625, + "router_z_loss_mlp": 1.87890625, + "step": 6360, + "time_per_iteration": 3.4101741313934326 + }, + { + "auxiliary_loss_clip": 0.0148838, + "auxiliary_loss_mlp": 0.01267485, + "balance_loss_clip": 1.1332742, + "balance_loss_mlp": 1.0403198, + "epoch": 0.7648650273552576, + "flos": 17677846928160.0, + "grad_norm": 2.177490198989577, + "language_loss": 0.74125361, + "learning_rate": 5.523201779258653e-07, + "loss": 0.7688123, + "num_input_tokens_seen": 136897705, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.26953125, + "step": 6361, + "time_per_iteration": 2.9411370754241943 + }, + { + "auxiliary_loss_clip": 0.01479468, + "auxiliary_loss_mlp": 0.01260276, + "balance_loss_clip": 1.12244737, + "balance_loss_mlp": 1.03272915, + "epoch": 0.7649852702458967, + "flos": 22164110360640.0, + "grad_norm": 2.319975790978082, + "language_loss": 0.84418148, + "learning_rate": 5.517828222684912e-07, + "loss": 0.87157893, + "num_input_tokens_seen": 136918360, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.2734375, + "step": 6362, + "time_per_iteration": 3.0147545337677 + }, + { + "auxiliary_loss_clip": 0.01437121, + "auxiliary_loss_mlp": 0.01192276, + "balance_loss_clip": 1.09831977, + "balance_loss_mlp": 1.0057373, + "epoch": 0.7651055131365359, + "flos": 69855113193120.0, + "grad_norm": 0.8080468225165637, + "language_loss": 0.59037232, + "learning_rate": 5.512456863045117e-07, + "loss": 0.61666632, + "num_input_tokens_seen": 136979050, + "router_z_loss_clip": 3.390625, + "router_z_loss_mlp": 1.86328125, + "step": 6363, + "time_per_iteration": 3.347445249557495 + }, + { + "auxiliary_loss_clip": 0.01476025, + "auxiliary_loss_mlp": 0.01263002, + "balance_loss_clip": 1.11900628, + "balance_loss_mlp": 1.0325942, + "epoch": 0.7652257560271749, + "flos": 19466063739360.0, + "grad_norm": 1.678863310202522, + "language_loss": 0.74191457, + "learning_rate": 5.507087701154089e-07, + "loss": 0.76930487, + "num_input_tokens_seen": 136998970, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 2.30078125, + "step": 6364, + "time_per_iteration": 3.0233681201934814 + }, + { + "auxiliary_loss_clip": 0.01484853, + "auxiliary_loss_mlp": 0.01259548, + "balance_loss_clip": 1.12961864, + "balance_loss_mlp": 1.03219187, + "epoch": 0.765345998917814, + "flos": 15962945978880.0, + "grad_norm": 16.571140719375016, + "language_loss": 0.75259638, + "learning_rate": 5.50172073782634e-07, + "loss": 0.78004044, + "num_input_tokens_seen": 137016950, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.27148438, + "step": 6365, + "time_per_iteration": 3.0121169090270996 + }, + { + "auxiliary_loss_clip": 0.01484934, + "auxiliary_loss_mlp": 0.01254856, + "balance_loss_clip": 1.12969947, + "balance_loss_mlp": 1.02711916, + "epoch": 0.7654662418084531, + "flos": 23662135905120.0, + "grad_norm": 2.4136857848022113, + "language_loss": 0.87639582, + "learning_rate": 5.496355973876023e-07, + "loss": 0.90379381, + "num_input_tokens_seen": 137036205, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.27734375, + "step": 6366, + "time_per_iteration": 3.819561719894409 + }, + { + "auxiliary_loss_clip": 0.01483002, + "auxiliary_loss_mlp": 0.01266557, + "balance_loss_clip": 1.12697816, + "balance_loss_mlp": 1.03557742, + "epoch": 0.7655864846990922, + "flos": 41466816057600.0, + "grad_norm": 2.680618292320744, + "language_loss": 0.70895916, + "learning_rate": 5.490993410116984e-07, + "loss": 0.73645478, + "num_input_tokens_seen": 137059195, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.3046875, + "step": 6367, + "time_per_iteration": 3.338472604751587 + }, + { + "auxiliary_loss_clip": 0.01483795, + "auxiliary_loss_mlp": 0.01257431, + "balance_loss_clip": 1.12934446, + "balance_loss_mlp": 1.02759516, + "epoch": 0.7657067275897312, + "flos": 43146026244000.0, + "grad_norm": 1.8829583768260107, + "language_loss": 0.69820595, + "learning_rate": 5.485633047362704e-07, + "loss": 0.72561824, + "num_input_tokens_seen": 137081200, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 2.29492188, + "step": 6368, + "time_per_iteration": 3.2281925678253174 + }, + { + "auxiliary_loss_clip": 0.01496973, + "auxiliary_loss_mlp": 0.01269158, + "balance_loss_clip": 1.14220309, + "balance_loss_mlp": 1.03817821, + "epoch": 0.7658269704803703, + "flos": 17313732948960.0, + "grad_norm": 2.322470625852312, + "language_loss": 0.78357285, + "learning_rate": 5.480274886426341e-07, + "loss": 0.81123412, + "num_input_tokens_seen": 137097840, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 2.30664062, + "step": 6369, + "time_per_iteration": 3.0174851417541504 + }, + { + "auxiliary_loss_clip": 0.01480526, + "auxiliary_loss_mlp": 0.01257395, + "balance_loss_clip": 1.12559104, + "balance_loss_mlp": 1.03003883, + "epoch": 0.7659472133710095, + "flos": 12569820975360.0, + "grad_norm": 2.0588560781519383, + "language_loss": 0.78221887, + "learning_rate": 5.474918928120744e-07, + "loss": 0.80959809, + "num_input_tokens_seen": 137114335, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 2.2734375, + "step": 6370, + "time_per_iteration": 3.1547601222991943 + }, + { + "auxiliary_loss_clip": 0.01477984, + "auxiliary_loss_mlp": 0.01255885, + "balance_loss_clip": 1.12163877, + "balance_loss_mlp": 1.0279572, + "epoch": 0.7660674562616485, + "flos": 22709446909920.0, + "grad_norm": 1.5824886101553257, + "language_loss": 0.87094426, + "learning_rate": 5.469565173258392e-07, + "loss": 0.898283, + "num_input_tokens_seen": 137132850, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.27734375, + "step": 6371, + "time_per_iteration": 3.10199236869812 + }, + { + "auxiliary_loss_clip": 0.01486635, + "auxiliary_loss_mlp": 0.01265755, + "balance_loss_clip": 1.12999475, + "balance_loss_mlp": 1.03496623, + "epoch": 0.7661876991522876, + "flos": 17058398025600.0, + "grad_norm": 1.9313656969644841, + "language_loss": 0.63841462, + "learning_rate": 5.464213622651454e-07, + "loss": 0.6659385, + "num_input_tokens_seen": 137150665, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.30273438, + "step": 6372, + "time_per_iteration": 3.0922889709472656 + }, + { + "auxiliary_loss_clip": 0.01482856, + "auxiliary_loss_mlp": 0.0126515, + "balance_loss_clip": 1.12569928, + "balance_loss_mlp": 1.03397906, + "epoch": 0.7663079420429267, + "flos": 20086309133280.0, + "grad_norm": 2.148729568927681, + "language_loss": 0.84582031, + "learning_rate": 5.458864277111753e-07, + "loss": 0.87330031, + "num_input_tokens_seen": 137168500, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 2.30664062, + "step": 6373, + "time_per_iteration": 3.0310778617858887 + }, + { + "auxiliary_loss_clip": 0.01479782, + "auxiliary_loss_mlp": 0.01248105, + "balance_loss_clip": 1.12502003, + "balance_loss_mlp": 1.02208447, + "epoch": 0.7664281849335658, + "flos": 12679131025440.0, + "grad_norm": 2.8067348625078594, + "language_loss": 0.69537723, + "learning_rate": 5.453517137450769e-07, + "loss": 0.72265613, + "num_input_tokens_seen": 137185075, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.25976562, + "step": 6374, + "time_per_iteration": 3.059061288833618 + }, + { + "auxiliary_loss_clip": 0.01488598, + "auxiliary_loss_mlp": 0.01258251, + "balance_loss_clip": 1.13438916, + "balance_loss_mlp": 1.02879703, + "epoch": 0.7665484278242048, + "flos": 22347418979520.0, + "grad_norm": 1.735064427895239, + "language_loss": 0.76094913, + "learning_rate": 5.448172204479684e-07, + "loss": 0.78841758, + "num_input_tokens_seen": 137204355, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 2.29492188, + "step": 6375, + "time_per_iteration": 3.119464159011841 + }, + { + "auxiliary_loss_clip": 0.01475352, + "auxiliary_loss_mlp": 0.01252584, + "balance_loss_clip": 1.11695552, + "balance_loss_mlp": 1.02293932, + "epoch": 0.766668670714844, + "flos": 23619883570560.0, + "grad_norm": 1.7655801276569907, + "language_loss": 0.74678671, + "learning_rate": 5.442829479009294e-07, + "loss": 0.77406603, + "num_input_tokens_seen": 137223135, + "router_z_loss_clip": 3.58007812, + "router_z_loss_mlp": 2.29492188, + "step": 6376, + "time_per_iteration": 3.0707650184631348 + }, + { + "auxiliary_loss_clip": 0.0148697, + "auxiliary_loss_mlp": 0.01267344, + "balance_loss_clip": 1.13289201, + "balance_loss_mlp": 1.03655505, + "epoch": 0.7667889136054831, + "flos": 19429159275360.0, + "grad_norm": 2.8424186881904214, + "language_loss": 0.71861923, + "learning_rate": 5.437488961850103e-07, + "loss": 0.74616241, + "num_input_tokens_seen": 137242935, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 2.30664062, + "step": 6377, + "time_per_iteration": 3.124109983444214 + }, + { + "auxiliary_loss_clip": 0.01476406, + "auxiliary_loss_mlp": 0.01252338, + "balance_loss_clip": 1.12082338, + "balance_loss_mlp": 1.02460027, + "epoch": 0.7669091564961221, + "flos": 26868576683520.0, + "grad_norm": 2.165033604714502, + "language_loss": 0.75368345, + "learning_rate": 5.432150653812258e-07, + "loss": 0.78097087, + "num_input_tokens_seen": 137262970, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.27929688, + "step": 6378, + "time_per_iteration": 3.0817666053771973 + }, + { + "auxiliary_loss_clip": 0.0148684, + "auxiliary_loss_mlp": 0.01265665, + "balance_loss_clip": 1.13248777, + "balance_loss_mlp": 1.0369736, + "epoch": 0.7670293993867613, + "flos": 12386929566240.0, + "grad_norm": 2.2122329944759738, + "language_loss": 0.82257485, + "learning_rate": 5.42681455570557e-07, + "loss": 0.85009986, + "num_input_tokens_seen": 137279500, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 2.28710938, + "step": 6379, + "time_per_iteration": 3.021345615386963 + }, + { + "auxiliary_loss_clip": 0.01479162, + "auxiliary_loss_mlp": 0.01254963, + "balance_loss_clip": 1.12298417, + "balance_loss_mlp": 1.0262723, + "epoch": 0.7671496422774003, + "flos": 21765140038080.0, + "grad_norm": 2.006381843593961, + "language_loss": 0.64890993, + "learning_rate": 5.42148066833954e-07, + "loss": 0.67625117, + "num_input_tokens_seen": 137298745, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.28710938, + "step": 6380, + "time_per_iteration": 3.8331830501556396 + }, + { + "auxiliary_loss_clip": 0.01486352, + "auxiliary_loss_mlp": 0.01254912, + "balance_loss_clip": 1.13181984, + "balance_loss_mlp": 1.0260303, + "epoch": 0.7672698851680394, + "flos": 21071351213280.0, + "grad_norm": 4.035281000976861, + "language_loss": 0.75177562, + "learning_rate": 5.416148992523289e-07, + "loss": 0.77918833, + "num_input_tokens_seen": 137317320, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.28515625, + "step": 6381, + "time_per_iteration": 3.87141489982605 + }, + { + "auxiliary_loss_clip": 0.01487422, + "auxiliary_loss_mlp": 0.01269908, + "balance_loss_clip": 1.13295221, + "balance_loss_mlp": 1.04102635, + "epoch": 0.7673901280586786, + "flos": 16978255094880.0, + "grad_norm": 3.2138720366904105, + "language_loss": 0.78730655, + "learning_rate": 5.410819529065644e-07, + "loss": 0.81487978, + "num_input_tokens_seen": 137335275, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.28515625, + "step": 6382, + "time_per_iteration": 3.078554153442383 + }, + { + "auxiliary_loss_clip": 0.01485675, + "auxiliary_loss_mlp": 0.01250907, + "balance_loss_clip": 1.13032508, + "balance_loss_mlp": 1.02202523, + "epoch": 0.7675103709493176, + "flos": 29245292654400.0, + "grad_norm": 2.2966822719124522, + "language_loss": 0.65628487, + "learning_rate": 5.405492278775079e-07, + "loss": 0.68365067, + "num_input_tokens_seen": 137355055, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.28710938, + "step": 6383, + "time_per_iteration": 2.987483501434326 + }, + { + "auxiliary_loss_clip": 0.0148804, + "auxiliary_loss_mlp": 0.01262007, + "balance_loss_clip": 1.13345468, + "balance_loss_mlp": 1.0295018, + "epoch": 0.7676306138399567, + "flos": 29025572637600.0, + "grad_norm": 2.3952887651245005, + "language_loss": 0.80010879, + "learning_rate": 5.400167242459732e-07, + "loss": 0.8276093, + "num_input_tokens_seen": 137374015, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.32226562, + "step": 6384, + "time_per_iteration": 3.084599018096924 + }, + { + "auxiliary_loss_clip": 0.01483026, + "auxiliary_loss_mlp": 0.01256289, + "balance_loss_clip": 1.12635696, + "balance_loss_mlp": 1.02836049, + "epoch": 0.7677508567305958, + "flos": 22567328637120.0, + "grad_norm": 1.6578273819805078, + "language_loss": 0.80760479, + "learning_rate": 5.394844420927405e-07, + "loss": 0.83499795, + "num_input_tokens_seen": 137393625, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.27734375, + "step": 6385, + "time_per_iteration": 3.0346338748931885 + }, + { + "auxiliary_loss_clip": 0.01483765, + "auxiliary_loss_mlp": 0.01261887, + "balance_loss_clip": 1.12773705, + "balance_loss_mlp": 1.03510368, + "epoch": 0.7678710996212349, + "flos": 25413827533920.0, + "grad_norm": 2.301435040833337, + "language_loss": 0.73358274, + "learning_rate": 5.389523814985562e-07, + "loss": 0.7610392, + "num_input_tokens_seen": 137413045, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.26757812, + "step": 6386, + "time_per_iteration": 3.8558261394500732 + }, + { + "auxiliary_loss_clip": 0.01484189, + "auxiliary_loss_mlp": 0.01261042, + "balance_loss_clip": 1.12788391, + "balance_loss_mlp": 1.03120685, + "epoch": 0.767991342511874, + "flos": 26759039064480.0, + "grad_norm": 2.053272914155708, + "language_loss": 0.7619108, + "learning_rate": 5.384205425441344e-07, + "loss": 0.78936315, + "num_input_tokens_seen": 137433955, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.29492188, + "step": 6387, + "time_per_iteration": 3.221933364868164 + }, + { + "auxiliary_loss_clip": 0.0147887, + "auxiliary_loss_mlp": 0.01261064, + "balance_loss_clip": 1.12303317, + "balance_loss_mlp": 1.03027463, + "epoch": 0.7681115854025131, + "flos": 26361927221760.0, + "grad_norm": 1.6832379126984156, + "language_loss": 0.84319234, + "learning_rate": 5.378889253101537e-07, + "loss": 0.8705917, + "num_input_tokens_seen": 137454510, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.30273438, + "step": 6388, + "time_per_iteration": 3.2203962802886963 + }, + { + "auxiliary_loss_clip": 0.01478596, + "auxiliary_loss_mlp": 0.0125242, + "balance_loss_clip": 1.12290215, + "balance_loss_mlp": 1.02430153, + "epoch": 0.7682318282931522, + "flos": 23259221053920.0, + "grad_norm": 2.0570807429216877, + "language_loss": 0.80799186, + "learning_rate": 5.373575298772617e-07, + "loss": 0.83530211, + "num_input_tokens_seen": 137473630, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.27929688, + "step": 6389, + "time_per_iteration": 3.162109136581421 + }, + { + "auxiliary_loss_clip": 0.01436577, + "auxiliary_loss_mlp": 0.01200348, + "balance_loss_clip": 1.09776092, + "balance_loss_mlp": 1.01419067, + "epoch": 0.7683520711837912, + "flos": 70079536301760.0, + "grad_norm": 0.7733721740209738, + "language_loss": 0.612782, + "learning_rate": 5.368263563260689e-07, + "loss": 0.63915122, + "num_input_tokens_seen": 137538765, + "router_z_loss_clip": 3.390625, + "router_z_loss_mlp": 1.859375, + "step": 6390, + "time_per_iteration": 3.6232187747955322 + }, + { + "auxiliary_loss_clip": 0.01483372, + "auxiliary_loss_mlp": 0.01261311, + "balance_loss_clip": 1.12795901, + "balance_loss_mlp": 1.03014028, + "epoch": 0.7684723140744304, + "flos": 18626894820000.0, + "grad_norm": 1.74252344221885, + "language_loss": 0.64285362, + "learning_rate": 5.362954047371537e-07, + "loss": 0.67030048, + "num_input_tokens_seen": 137557875, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.31054688, + "step": 6391, + "time_per_iteration": 3.1902894973754883 + }, + { + "auxiliary_loss_clip": 0.01490675, + "auxiliary_loss_mlp": 0.01266717, + "balance_loss_clip": 1.13411057, + "balance_loss_mlp": 1.03287601, + "epoch": 0.7685925569650695, + "flos": 27455407004160.0, + "grad_norm": 1.8342361629641382, + "language_loss": 0.72306162, + "learning_rate": 5.357646751910627e-07, + "loss": 0.7506355, + "num_input_tokens_seen": 137579055, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.33398438, + "step": 6392, + "time_per_iteration": 3.2025070190429688 + }, + { + "auxiliary_loss_clip": 0.01486805, + "auxiliary_loss_mlp": 0.01261429, + "balance_loss_clip": 1.13117623, + "balance_loss_mlp": 1.03044939, + "epoch": 0.7687127998557085, + "flos": 24538057575840.0, + "grad_norm": 2.6921577845328897, + "language_loss": 0.79759824, + "learning_rate": 5.352341677683061e-07, + "loss": 0.82508063, + "num_input_tokens_seen": 137600355, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.30664062, + "step": 6393, + "time_per_iteration": 3.1427760124206543 + }, + { + "auxiliary_loss_clip": 0.01482061, + "auxiliary_loss_mlp": 0.01259021, + "balance_loss_clip": 1.12640738, + "balance_loss_mlp": 1.02746892, + "epoch": 0.7688330427463477, + "flos": 25158340897920.0, + "grad_norm": 6.18436598027393, + "language_loss": 0.79263425, + "learning_rate": 5.347038825493617e-07, + "loss": 0.82004505, + "num_input_tokens_seen": 137621885, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.3125, + "step": 6394, + "time_per_iteration": 4.164757251739502 + }, + { + "auxiliary_loss_clip": 0.01485739, + "auxiliary_loss_mlp": 0.01254557, + "balance_loss_clip": 1.13204741, + "balance_loss_mlp": 1.02586603, + "epoch": 0.7689532856369867, + "flos": 21213203988960.0, + "grad_norm": 2.1326311885033613, + "language_loss": 0.68531585, + "learning_rate": 5.341738196146732e-07, + "loss": 0.71271884, + "num_input_tokens_seen": 137640230, + "router_z_loss_clip": 3.54101562, + "router_z_loss_mlp": 2.28710938, + "step": 6395, + "time_per_iteration": 3.105958938598633 + }, + { + "auxiliary_loss_clip": 0.01481438, + "auxiliary_loss_mlp": 0.01257566, + "balance_loss_clip": 1.12493551, + "balance_loss_mlp": 1.02773046, + "epoch": 0.7690735285276258, + "flos": 25121436433920.0, + "grad_norm": 8.8131614843087, + "language_loss": 0.73460913, + "learning_rate": 5.336439790446503e-07, + "loss": 0.76199913, + "num_input_tokens_seen": 137659330, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.29492188, + "step": 6396, + "time_per_iteration": 3.0282862186431885 + }, + { + "auxiliary_loss_clip": 0.01486742, + "auxiliary_loss_mlp": 0.01258403, + "balance_loss_clip": 1.1305058, + "balance_loss_mlp": 1.03142893, + "epoch": 0.769193771418265, + "flos": 54747949032000.0, + "grad_norm": 2.061561407944318, + "language_loss": 0.62840021, + "learning_rate": 5.331143609196711e-07, + "loss": 0.65585166, + "num_input_tokens_seen": 137683145, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.27148438, + "step": 6397, + "time_per_iteration": 3.3372976779937744 + }, + { + "auxiliary_loss_clip": 0.01481498, + "auxiliary_loss_mlp": 0.01259134, + "balance_loss_clip": 1.12586546, + "balance_loss_mlp": 1.02700961, + "epoch": 0.769314014308904, + "flos": 37344932101440.0, + "grad_norm": 1.8776840534485502, + "language_loss": 0.77415311, + "learning_rate": 5.325849653200758e-07, + "loss": 0.80155945, + "num_input_tokens_seen": 137707095, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.31835938, + "step": 6398, + "time_per_iteration": 3.1347079277038574 + }, + { + "auxiliary_loss_clip": 0.01480928, + "auxiliary_loss_mlp": 0.01254169, + "balance_loss_clip": 1.12455034, + "balance_loss_mlp": 1.02471471, + "epoch": 0.7694342571995431, + "flos": 20633807587680.0, + "grad_norm": 4.3387213796961674, + "language_loss": 0.7671324, + "learning_rate": 5.32055792326175e-07, + "loss": 0.79448336, + "num_input_tokens_seen": 137725520, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.29296875, + "step": 6399, + "time_per_iteration": 3.22031569480896 + }, + { + "auxiliary_loss_clip": 0.01490451, + "auxiliary_loss_mlp": 0.01260508, + "balance_loss_clip": 1.13516474, + "balance_loss_mlp": 1.03143573, + "epoch": 0.7695545000901821, + "flos": 24210051569280.0, + "grad_norm": 2.8167995227331764, + "language_loss": 0.7272892, + "learning_rate": 5.315268420182437e-07, + "loss": 0.75479877, + "num_input_tokens_seen": 137744195, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.2890625, + "step": 6400, + "time_per_iteration": 3.206824541091919 + }, + { + "auxiliary_loss_clip": 0.01483653, + "auxiliary_loss_mlp": 0.0125174, + "balance_loss_clip": 1.12725616, + "balance_loss_mlp": 1.02133226, + "epoch": 0.7696747429808213, + "flos": 28003777806240.0, + "grad_norm": 1.8706720023537842, + "language_loss": 0.76479363, + "learning_rate": 5.309981144765221e-07, + "loss": 0.79214764, + "num_input_tokens_seen": 137764340, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.30273438, + "step": 6401, + "time_per_iteration": 3.205416202545166 + }, + { + "auxiliary_loss_clip": 0.01482071, + "auxiliary_loss_mlp": 0.01260591, + "balance_loss_clip": 1.12710965, + "balance_loss_mlp": 1.03018343, + "epoch": 0.7697949858714603, + "flos": 11511121680000.0, + "grad_norm": 2.3941206852869543, + "language_loss": 0.75130749, + "learning_rate": 5.304696097812196e-07, + "loss": 0.77873409, + "num_input_tokens_seen": 137780940, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 2.30078125, + "step": 6402, + "time_per_iteration": 3.147709846496582 + }, + { + "auxiliary_loss_clip": 0.01485888, + "auxiliary_loss_mlp": 0.0127078, + "balance_loss_clip": 1.13062143, + "balance_loss_mlp": 1.03465009, + "epoch": 0.7699152287620994, + "flos": 26690729719680.0, + "grad_norm": 2.953896601379127, + "language_loss": 0.60591245, + "learning_rate": 5.299413280125078e-07, + "loss": 0.63347912, + "num_input_tokens_seen": 137799250, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.35546875, + "step": 6403, + "time_per_iteration": 3.0363945960998535 + }, + { + "auxiliary_loss_clip": 0.01483274, + "auxiliary_loss_mlp": 0.01254927, + "balance_loss_clip": 1.12796736, + "balance_loss_mlp": 1.02127647, + "epoch": 0.7700354716527386, + "flos": 16546817903040.0, + "grad_norm": 1.974056103884322, + "language_loss": 0.73104966, + "learning_rate": 5.294132692505284e-07, + "loss": 0.75843173, + "num_input_tokens_seen": 137817660, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.33203125, + "step": 6404, + "time_per_iteration": 3.1025288105010986 + }, + { + "auxiliary_loss_clip": 0.01479488, + "auxiliary_loss_mlp": 0.01262992, + "balance_loss_clip": 1.12317562, + "balance_loss_mlp": 1.03468215, + "epoch": 0.7701557145433776, + "flos": 19244636955360.0, + "grad_norm": 2.80844131552478, + "language_loss": 0.78648925, + "learning_rate": 5.288854335753861e-07, + "loss": 0.81391406, + "num_input_tokens_seen": 137835920, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.28515625, + "step": 6405, + "time_per_iteration": 3.1110596656799316 + }, + { + "auxiliary_loss_clip": 0.01481103, + "auxiliary_loss_mlp": 0.0125483, + "balance_loss_clip": 1.12476063, + "balance_loss_mlp": 1.02671099, + "epoch": 0.7702759574340167, + "flos": 31688421562080.0, + "grad_norm": 3.003758705354311, + "language_loss": 0.75879657, + "learning_rate": 5.283578210671551e-07, + "loss": 0.78615588, + "num_input_tokens_seen": 137858160, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.28125, + "step": 6406, + "time_per_iteration": 3.035036563873291 + }, + { + "auxiliary_loss_clip": 0.01485401, + "auxiliary_loss_mlp": 0.01262712, + "balance_loss_clip": 1.12965977, + "balance_loss_mlp": 1.03402066, + "epoch": 0.7703962003246558, + "flos": 16801963185600.0, + "grad_norm": 3.3685951775087704, + "language_loss": 0.7677415, + "learning_rate": 5.278304318058719e-07, + "loss": 0.79522264, + "num_input_tokens_seen": 137876015, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.28320312, + "step": 6407, + "time_per_iteration": 3.8666226863861084 + }, + { + "auxiliary_loss_clip": 0.0147819, + "auxiliary_loss_mlp": 0.01264161, + "balance_loss_clip": 1.1218226, + "balance_loss_mlp": 1.03318143, + "epoch": 0.7705164432152949, + "flos": 35738999848800.0, + "grad_norm": 2.0052804112836737, + "language_loss": 0.794025, + "learning_rate": 5.273032658715411e-07, + "loss": 0.82144856, + "num_input_tokens_seen": 137898825, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.30664062, + "step": 6408, + "time_per_iteration": 3.1300623416900635 + }, + { + "auxiliary_loss_clip": 0.01481998, + "auxiliary_loss_mlp": 0.01263133, + "balance_loss_clip": 1.12642241, + "balance_loss_mlp": 1.03386927, + "epoch": 0.7706366861059339, + "flos": 23368113894240.0, + "grad_norm": 1.906726029324992, + "language_loss": 0.7669822, + "learning_rate": 5.267763233441347e-07, + "loss": 0.79443353, + "num_input_tokens_seen": 137919455, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.29101562, + "step": 6409, + "time_per_iteration": 3.8956298828125 + }, + { + "auxiliary_loss_clip": 0.01482909, + "auxiliary_loss_mlp": 0.01259124, + "balance_loss_clip": 1.1267736, + "balance_loss_mlp": 1.02661824, + "epoch": 0.7707569289965731, + "flos": 22932125323200.0, + "grad_norm": 2.683149626618133, + "language_loss": 0.69501042, + "learning_rate": 5.26249604303588e-07, + "loss": 0.72243077, + "num_input_tokens_seen": 137937960, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.3203125, + "step": 6410, + "time_per_iteration": 3.012181282043457 + }, + { + "auxiliary_loss_clip": 0.01484904, + "auxiliary_loss_mlp": 0.01261552, + "balance_loss_clip": 1.12946594, + "balance_loss_mlp": 1.03152561, + "epoch": 0.7708771718872122, + "flos": 17422587861120.0, + "grad_norm": 2.2036484978606916, + "language_loss": 0.7881462, + "learning_rate": 5.257231088298057e-07, + "loss": 0.81561077, + "num_input_tokens_seen": 137956370, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.296875, + "step": 6411, + "time_per_iteration": 3.0988550186157227 + }, + { + "auxiliary_loss_clip": 0.01432857, + "auxiliary_loss_mlp": 0.01186218, + "balance_loss_clip": 1.09433746, + "balance_loss_mlp": 1.00006104, + "epoch": 0.7709974147778512, + "flos": 72247380073920.0, + "grad_norm": 0.8034766227880047, + "language_loss": 0.53869647, + "learning_rate": 5.25196837002655e-07, + "loss": 0.56488723, + "num_input_tokens_seen": 138016080, + "router_z_loss_clip": 3.375, + "router_z_loss_mlp": 1.859375, + "step": 6412, + "time_per_iteration": 3.4621737003326416 + }, + { + "auxiliary_loss_clip": 0.01487435, + "auxiliary_loss_mlp": 0.01262229, + "balance_loss_clip": 1.13257432, + "balance_loss_mlp": 1.03163028, + "epoch": 0.7711176576684904, + "flos": 39862893997440.0, + "grad_norm": 1.939029883291276, + "language_loss": 0.68396962, + "learning_rate": 5.24670788901971e-07, + "loss": 0.71146625, + "num_input_tokens_seen": 138039170, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.30273438, + "step": 6413, + "time_per_iteration": 4.025279760360718 + }, + { + "auxiliary_loss_clip": 0.01491421, + "auxiliary_loss_mlp": 0.01270645, + "balance_loss_clip": 1.13411617, + "balance_loss_mlp": 1.03585017, + "epoch": 0.7712379005591294, + "flos": 36979376852160.0, + "grad_norm": 2.1546217156461034, + "language_loss": 0.68511158, + "learning_rate": 5.241449646075557e-07, + "loss": 0.71273226, + "num_input_tokens_seen": 138062395, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 2.34570312, + "step": 6414, + "time_per_iteration": 3.131861925125122 + }, + { + "auxiliary_loss_clip": 0.01481505, + "auxiliary_loss_mlp": 0.01257225, + "balance_loss_clip": 1.12637985, + "balance_loss_mlp": 1.02643621, + "epoch": 0.7713581434497685, + "flos": 22778590674240.0, + "grad_norm": 2.3588726558812514, + "language_loss": 0.72778392, + "learning_rate": 5.236193641991762e-07, + "loss": 0.75517118, + "num_input_tokens_seen": 138080325, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 2.3046875, + "step": 6415, + "time_per_iteration": 2.9715189933776855 + }, + { + "auxiliary_loss_clip": 0.01480199, + "auxiliary_loss_mlp": 0.01257084, + "balance_loss_clip": 1.12398028, + "balance_loss_mlp": 1.02991867, + "epoch": 0.7714783863404077, + "flos": 24099755387040.0, + "grad_norm": 1.885102499590565, + "language_loss": 0.69655514, + "learning_rate": 5.23093987756565e-07, + "loss": 0.72392797, + "num_input_tokens_seen": 138099020, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.27148438, + "step": 6416, + "time_per_iteration": 2.9870030879974365 + }, + { + "auxiliary_loss_clip": 0.01478786, + "auxiliary_loss_mlp": 0.01259902, + "balance_loss_clip": 1.12213683, + "balance_loss_mlp": 1.02854037, + "epoch": 0.7715986292310467, + "flos": 21065472348480.0, + "grad_norm": 1.9688787550620213, + "language_loss": 0.7564075, + "learning_rate": 5.225688353594217e-07, + "loss": 0.7837944, + "num_input_tokens_seen": 138118650, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.31054688, + "step": 6417, + "time_per_iteration": 3.043996572494507 + }, + { + "auxiliary_loss_clip": 0.01479959, + "auxiliary_loss_mlp": 0.01254774, + "balance_loss_clip": 1.12306094, + "balance_loss_mlp": 1.02264977, + "epoch": 0.7717188721216858, + "flos": 20596978980000.0, + "grad_norm": 2.142517075377858, + "language_loss": 0.77472454, + "learning_rate": 5.220439070874108e-07, + "loss": 0.80207187, + "num_input_tokens_seen": 138137890, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.31640625, + "step": 6418, + "time_per_iteration": 2.962815999984741 + }, + { + "auxiliary_loss_clip": 0.01483076, + "auxiliary_loss_mlp": 0.01251922, + "balance_loss_clip": 1.12819624, + "balance_loss_mlp": 1.02208638, + "epoch": 0.7718391150123249, + "flos": 26253413663040.0, + "grad_norm": 1.6305687545361922, + "language_loss": 0.71400619, + "learning_rate": 5.215192030201652e-07, + "loss": 0.74135613, + "num_input_tokens_seen": 138158880, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.296875, + "step": 6419, + "time_per_iteration": 3.0386850833892822 + }, + { + "auxiliary_loss_clip": 0.01482292, + "auxiliary_loss_mlp": 0.01255269, + "balance_loss_clip": 1.12610197, + "balance_loss_mlp": 1.02772188, + "epoch": 0.771959357902964, + "flos": 22051690201440.0, + "grad_norm": 2.0356192227924006, + "language_loss": 0.86167097, + "learning_rate": 5.209947232372798e-07, + "loss": 0.88904661, + "num_input_tokens_seen": 138176370, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.27539062, + "step": 6420, + "time_per_iteration": 2.9575438499450684 + }, + { + "auxiliary_loss_clip": 0.01484907, + "auxiliary_loss_mlp": 0.01258325, + "balance_loss_clip": 1.12782812, + "balance_loss_mlp": 1.026582, + "epoch": 0.772079600793603, + "flos": 30448651409280.0, + "grad_norm": 2.115483580461813, + "language_loss": 0.81114328, + "learning_rate": 5.204704678183196e-07, + "loss": 0.8385756, + "num_input_tokens_seen": 138195105, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.3125, + "step": 6421, + "time_per_iteration": 3.0064914226531982 + }, + { + "auxiliary_loss_clip": 0.01482976, + "auxiliary_loss_mlp": 0.01261858, + "balance_loss_clip": 1.12769496, + "balance_loss_mlp": 1.03144991, + "epoch": 0.7721998436842422, + "flos": 12971560053600.0, + "grad_norm": 2.2878843919680434, + "language_loss": 0.84785253, + "learning_rate": 5.19946436842813e-07, + "loss": 0.87530088, + "num_input_tokens_seen": 138212235, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.30078125, + "step": 6422, + "time_per_iteration": 3.7374050617218018 + }, + { + "auxiliary_loss_clip": 0.01479009, + "auxiliary_loss_mlp": 0.01255533, + "balance_loss_clip": 1.12374425, + "balance_loss_mlp": 1.02760434, + "epoch": 0.7723200865748813, + "flos": 32638303873440.0, + "grad_norm": 1.573293975771619, + "language_loss": 0.68343788, + "learning_rate": 5.194226303902546e-07, + "loss": 0.7107833, + "num_input_tokens_seen": 138231970, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.28125, + "step": 6423, + "time_per_iteration": 3.0711424350738525 + }, + { + "auxiliary_loss_clip": 0.0147868, + "auxiliary_loss_mlp": 0.01261451, + "balance_loss_clip": 1.1229068, + "balance_loss_mlp": 1.03104329, + "epoch": 0.7724403294655203, + "flos": 21107838467520.0, + "grad_norm": 2.0970188262428113, + "language_loss": 0.70718467, + "learning_rate": 5.188990485401072e-07, + "loss": 0.734586, + "num_input_tokens_seen": 138251175, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.30078125, + "step": 6424, + "time_per_iteration": 3.1584885120391846 + }, + { + "auxiliary_loss_clip": 0.014809, + "auxiliary_loss_mlp": 0.01270471, + "balance_loss_clip": 1.12481976, + "balance_loss_mlp": 1.04292452, + "epoch": 0.7725605723561595, + "flos": 22092918475680.0, + "grad_norm": 4.76964474374201, + "language_loss": 0.86354327, + "learning_rate": 5.183756913717954e-07, + "loss": 0.89105701, + "num_input_tokens_seen": 138270950, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.27539062, + "step": 6425, + "time_per_iteration": 3.0546493530273438 + }, + { + "auxiliary_loss_clip": 0.01477029, + "auxiliary_loss_mlp": 0.0126042, + "balance_loss_clip": 1.12086916, + "balance_loss_mlp": 1.03001249, + "epoch": 0.7726808152467985, + "flos": 34498471132800.0, + "grad_norm": 1.883815763662984, + "language_loss": 0.73035598, + "learning_rate": 5.178525589647136e-07, + "loss": 0.75773048, + "num_input_tokens_seen": 138292590, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.30273438, + "step": 6426, + "time_per_iteration": 3.035945415496826 + }, + { + "auxiliary_loss_clip": 0.01478907, + "auxiliary_loss_mlp": 0.0126302, + "balance_loss_clip": 1.12334847, + "balance_loss_mlp": 1.03604507, + "epoch": 0.7728010581374376, + "flos": 22308352610400.0, + "grad_norm": 2.2339390608200556, + "language_loss": 0.79059458, + "learning_rate": 5.173296513982197e-07, + "loss": 0.81801391, + "num_input_tokens_seen": 138311115, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.26953125, + "step": 6427, + "time_per_iteration": 3.0433030128479004 + }, + { + "auxiliary_loss_clip": 0.01482964, + "auxiliary_loss_mlp": 0.01273942, + "balance_loss_clip": 1.1260159, + "balance_loss_mlp": 1.04410648, + "epoch": 0.7729213010280768, + "flos": 27128956052160.0, + "grad_norm": 3.05304936958804, + "language_loss": 0.65109503, + "learning_rate": 5.168069687516398e-07, + "loss": 0.67866409, + "num_input_tokens_seen": 138330885, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.296875, + "step": 6428, + "time_per_iteration": 3.00876522064209 + }, + { + "auxiliary_loss_clip": 0.0148401, + "auxiliary_loss_mlp": 0.01260631, + "balance_loss_clip": 1.12881422, + "balance_loss_mlp": 1.02984214, + "epoch": 0.7730415439187158, + "flos": 18152181233280.0, + "grad_norm": 2.0367400300695384, + "language_loss": 0.71891522, + "learning_rate": 5.16284511104263e-07, + "loss": 0.74636167, + "num_input_tokens_seen": 138350020, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.3046875, + "step": 6429, + "time_per_iteration": 3.0380256175994873 + }, + { + "auxiliary_loss_clip": 0.0148428, + "auxiliary_loss_mlp": 0.01266566, + "balance_loss_clip": 1.12835622, + "balance_loss_mlp": 1.03863776, + "epoch": 0.7731617868093549, + "flos": 11949082515360.0, + "grad_norm": 2.252219606675312, + "language_loss": 0.81328505, + "learning_rate": 5.157622785353457e-07, + "loss": 0.84079349, + "num_input_tokens_seen": 138368135, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.28125, + "step": 6430, + "time_per_iteration": 3.0109546184539795 + }, + { + "auxiliary_loss_clip": 0.01433149, + "auxiliary_loss_mlp": 0.01196373, + "balance_loss_clip": 1.09415007, + "balance_loss_mlp": 1.00868988, + "epoch": 0.7732820296999939, + "flos": 64207932981120.0, + "grad_norm": 0.6611910632018148, + "language_loss": 0.60286772, + "learning_rate": 5.152402711241113e-07, + "loss": 0.62916303, + "num_input_tokens_seen": 138436040, + "router_z_loss_clip": 3.390625, + "router_z_loss_mlp": 1.875, + "step": 6431, + "time_per_iteration": 3.4893617630004883 + }, + { + "auxiliary_loss_clip": 0.01480103, + "auxiliary_loss_mlp": 0.01261448, + "balance_loss_clip": 1.12389708, + "balance_loss_mlp": 1.03275633, + "epoch": 0.7734022725906331, + "flos": 25304593340160.0, + "grad_norm": 1.852762865272873, + "language_loss": 0.83098119, + "learning_rate": 5.147184889497465e-07, + "loss": 0.85839665, + "num_input_tokens_seen": 138455510, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.28710938, + "step": 6432, + "time_per_iteration": 3.100248336791992 + }, + { + "auxiliary_loss_clip": 0.01481129, + "auxiliary_loss_mlp": 0.01259927, + "balance_loss_clip": 1.12595046, + "balance_loss_mlp": 1.02990079, + "epoch": 0.7735225154812722, + "flos": 17349347855520.0, + "grad_norm": 2.68673122577372, + "language_loss": 0.79920971, + "learning_rate": 5.141969320914072e-07, + "loss": 0.82662022, + "num_input_tokens_seen": 138473015, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.29882812, + "step": 6433, + "time_per_iteration": 3.036653757095337 + }, + { + "auxiliary_loss_clip": 0.01483204, + "auxiliary_loss_mlp": 0.01268009, + "balance_loss_clip": 1.12623262, + "balance_loss_mlp": 1.03664732, + "epoch": 0.7736427583719112, + "flos": 32632576721280.0, + "grad_norm": 2.9904854591962913, + "language_loss": 0.6296407, + "learning_rate": 5.136756006282113e-07, + "loss": 0.65715289, + "num_input_tokens_seen": 138491680, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.31054688, + "step": 6434, + "time_per_iteration": 3.1138367652893066 + }, + { + "auxiliary_loss_clip": 0.01477982, + "auxiliary_loss_mlp": 0.01263392, + "balance_loss_clip": 1.12140095, + "balance_loss_mlp": 1.03431964, + "epoch": 0.7737630012625504, + "flos": 19861848096480.0, + "grad_norm": 2.3461147784560565, + "language_loss": 0.85162902, + "learning_rate": 5.131544946392446e-07, + "loss": 0.87904274, + "num_input_tokens_seen": 138506960, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.29101562, + "step": 6435, + "time_per_iteration": 3.8692986965179443 + }, + { + "auxiliary_loss_clip": 0.014804, + "auxiliary_loss_mlp": 0.01262418, + "balance_loss_clip": 1.12478232, + "balance_loss_mlp": 1.02972114, + "epoch": 0.7738832441531894, + "flos": 36025284515040.0, + "grad_norm": 2.218989150551872, + "language_loss": 0.63862705, + "learning_rate": 5.126336142035592e-07, + "loss": 0.66605532, + "num_input_tokens_seen": 138526995, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.32421875, + "step": 6436, + "time_per_iteration": 3.1140296459198 + }, + { + "auxiliary_loss_clip": 0.01481771, + "auxiliary_loss_mlp": 0.01262226, + "balance_loss_clip": 1.12642312, + "balance_loss_mlp": 1.02991068, + "epoch": 0.7740034870438285, + "flos": 13406865917760.0, + "grad_norm": 2.844694268610164, + "language_loss": 0.72366416, + "learning_rate": 5.121129594001721e-07, + "loss": 0.75110412, + "num_input_tokens_seen": 138541260, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.3203125, + "step": 6437, + "time_per_iteration": 3.8251571655273438 + }, + { + "auxiliary_loss_clip": 0.01483978, + "auxiliary_loss_mlp": 0.01262861, + "balance_loss_clip": 1.12729692, + "balance_loss_mlp": 1.03512335, + "epoch": 0.7741237299344677, + "flos": 22088670521760.0, + "grad_norm": 2.2025421484615855, + "language_loss": 0.81144476, + "learning_rate": 5.115925303080661e-07, + "loss": 0.8389132, + "num_input_tokens_seen": 138560970, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.27734375, + "step": 6438, + "time_per_iteration": 3.0066850185394287 + }, + { + "auxiliary_loss_clip": 0.01477107, + "auxiliary_loss_mlp": 0.0126254, + "balance_loss_clip": 1.12075806, + "balance_loss_mlp": 1.03308606, + "epoch": 0.7742439728251067, + "flos": 19866816685440.0, + "grad_norm": 1.968100829756219, + "language_loss": 0.79027629, + "learning_rate": 5.110723270061899e-07, + "loss": 0.81767273, + "num_input_tokens_seen": 138577460, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.29296875, + "step": 6439, + "time_per_iteration": 2.9437508583068848 + }, + { + "auxiliary_loss_clip": 0.01476265, + "auxiliary_loss_mlp": 0.01254419, + "balance_loss_clip": 1.11902547, + "balance_loss_mlp": 1.02858949, + "epoch": 0.7743642157157458, + "flos": 16692008356800.0, + "grad_norm": 2.455927321025395, + "language_loss": 0.79421186, + "learning_rate": 5.105523495734572e-07, + "loss": 0.82151866, + "num_input_tokens_seen": 138594860, + "router_z_loss_clip": 3.57617188, + "router_z_loss_mlp": 2.26171875, + "step": 6440, + "time_per_iteration": 3.834916114807129 + }, + { + "auxiliary_loss_clip": 0.01476527, + "auxiliary_loss_mlp": 0.01253674, + "balance_loss_clip": 1.11944854, + "balance_loss_mlp": 1.02326632, + "epoch": 0.7744844586063849, + "flos": 20306522216160.0, + "grad_norm": 1.7219349163708615, + "language_loss": 0.75152516, + "learning_rate": 5.100325980887499e-07, + "loss": 0.77882713, + "num_input_tokens_seen": 138614785, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.30273438, + "step": 6441, + "time_per_iteration": 3.0272626876831055 + }, + { + "auxiliary_loss_clip": 0.01480514, + "auxiliary_loss_mlp": 0.01259986, + "balance_loss_clip": 1.12426043, + "balance_loss_mlp": 1.0318675, + "epoch": 0.774604701497024, + "flos": 22968953930880.0, + "grad_norm": 1.9821871838283305, + "language_loss": 0.83492714, + "learning_rate": 5.095130726309116e-07, + "loss": 0.86233211, + "num_input_tokens_seen": 138634960, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.27929688, + "step": 6442, + "time_per_iteration": 2.9864437580108643 + }, + { + "auxiliary_loss_clip": 0.01430718, + "auxiliary_loss_mlp": 0.01193954, + "balance_loss_clip": 1.09097719, + "balance_loss_mlp": 1.00741577, + "epoch": 0.774724944387663, + "flos": 60294731947200.0, + "grad_norm": 0.7902225308893533, + "language_loss": 0.58945233, + "learning_rate": 5.089937732787559e-07, + "loss": 0.61569905, + "num_input_tokens_seen": 138699520, + "router_z_loss_clip": 3.3984375, + "router_z_loss_mlp": 1.86328125, + "step": 6443, + "time_per_iteration": 3.562218427658081 + }, + { + "auxiliary_loss_clip": 0.01481937, + "auxiliary_loss_mlp": 0.01264662, + "balance_loss_clip": 1.1251502, + "balance_loss_mlp": 1.03577995, + "epoch": 0.7748451872783022, + "flos": 26763476659200.0, + "grad_norm": 2.2857292268409957, + "language_loss": 0.66168302, + "learning_rate": 5.084747001110592e-07, + "loss": 0.68914902, + "num_input_tokens_seen": 138719145, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.28710938, + "step": 6444, + "time_per_iteration": 3.1308112144470215 + }, + { + "auxiliary_loss_clip": 0.01476484, + "auxiliary_loss_mlp": 0.01249639, + "balance_loss_clip": 1.11979437, + "balance_loss_mlp": 1.02380919, + "epoch": 0.7749654301689413, + "flos": 30341389479840.0, + "grad_norm": 2.540679909409911, + "language_loss": 0.7051242, + "learning_rate": 5.07955853206564e-07, + "loss": 0.7323854, + "num_input_tokens_seen": 138743850, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.2578125, + "step": 6445, + "time_per_iteration": 3.067073345184326 + }, + { + "auxiliary_loss_clip": 0.01479332, + "auxiliary_loss_mlp": 0.01259821, + "balance_loss_clip": 1.12219334, + "balance_loss_mlp": 1.02998579, + "epoch": 0.7750856730595803, + "flos": 43183234133280.0, + "grad_norm": 2.0092492149524777, + "language_loss": 0.71384907, + "learning_rate": 5.074372326439807e-07, + "loss": 0.7412405, + "num_input_tokens_seen": 138766860, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 2.29492188, + "step": 6446, + "time_per_iteration": 3.1969850063323975 + }, + { + "auxiliary_loss_clip": 0.01473493, + "auxiliary_loss_mlp": 0.01263507, + "balance_loss_clip": 1.11709976, + "balance_loss_mlp": 1.03214538, + "epoch": 0.7752059159502195, + "flos": 17642004452640.0, + "grad_norm": 2.1727973785176893, + "language_loss": 0.73869801, + "learning_rate": 5.069188385019814e-07, + "loss": 0.76606798, + "num_input_tokens_seen": 138784560, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.31054688, + "step": 6447, + "time_per_iteration": 2.98286509513855 + }, + { + "auxiliary_loss_clip": 0.01480029, + "auxiliary_loss_mlp": 0.01256293, + "balance_loss_clip": 1.1225282, + "balance_loss_mlp": 1.02493179, + "epoch": 0.7753261588408585, + "flos": 12679358594400.0, + "grad_norm": 3.966528885147541, + "language_loss": 0.61197054, + "learning_rate": 5.064006708592077e-07, + "loss": 0.63933372, + "num_input_tokens_seen": 138800805, + "router_z_loss_clip": 3.57617188, + "router_z_loss_mlp": 2.30859375, + "step": 6448, + "time_per_iteration": 2.965144157409668 + }, + { + "auxiliary_loss_clip": 0.01482689, + "auxiliary_loss_mlp": 0.01262215, + "balance_loss_clip": 1.12638712, + "balance_loss_mlp": 1.03447795, + "epoch": 0.7754464017314976, + "flos": 16692615207360.0, + "grad_norm": 2.9076186462867586, + "language_loss": 0.7576884, + "learning_rate": 5.058827297942641e-07, + "loss": 0.78513747, + "num_input_tokens_seen": 138815910, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.27734375, + "step": 6449, + "time_per_iteration": 3.885596752166748 + }, + { + "auxiliary_loss_clip": 0.01479175, + "auxiliary_loss_mlp": 0.01255385, + "balance_loss_clip": 1.1230166, + "balance_loss_mlp": 1.02783847, + "epoch": 0.7755666446221368, + "flos": 19720981452960.0, + "grad_norm": 2.256786187962036, + "language_loss": 0.75214899, + "learning_rate": 5.053650153857237e-07, + "loss": 0.77949452, + "num_input_tokens_seen": 138834920, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.27734375, + "step": 6450, + "time_per_iteration": 3.126777172088623 + }, + { + "auxiliary_loss_clip": 0.01479962, + "auxiliary_loss_mlp": 0.01255814, + "balance_loss_clip": 1.12176239, + "balance_loss_mlp": 1.02864909, + "epoch": 0.7756868875127758, + "flos": 18695393805600.0, + "grad_norm": 1.7524817902586072, + "language_loss": 0.70026797, + "learning_rate": 5.048475277121214e-07, + "loss": 0.72762573, + "num_input_tokens_seen": 138852135, + "router_z_loss_clip": 3.58007812, + "router_z_loss_mlp": 2.2734375, + "step": 6451, + "time_per_iteration": 2.9270949363708496 + }, + { + "auxiliary_loss_clip": 0.01474754, + "auxiliary_loss_mlp": 0.01258272, + "balance_loss_clip": 1.11762023, + "balance_loss_mlp": 1.02996218, + "epoch": 0.7758071304034149, + "flos": 28406806441920.0, + "grad_norm": 2.0409752863345165, + "language_loss": 0.77094424, + "learning_rate": 5.043302668519598e-07, + "loss": 0.79827446, + "num_input_tokens_seen": 138871470, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.28320312, + "step": 6452, + "time_per_iteration": 3.0121142864227295 + }, + { + "auxiliary_loss_clip": 0.01479323, + "auxiliary_loss_mlp": 0.01256485, + "balance_loss_clip": 1.12285042, + "balance_loss_mlp": 1.02607727, + "epoch": 0.775927373294054, + "flos": 20597472046080.0, + "grad_norm": 1.9224489638329465, + "language_loss": 0.72365355, + "learning_rate": 5.038132328837079e-07, + "loss": 0.75101167, + "num_input_tokens_seen": 138889860, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.3046875, + "step": 6453, + "time_per_iteration": 3.0128557682037354 + }, + { + "auxiliary_loss_clip": 0.01484325, + "auxiliary_loss_mlp": 0.01255557, + "balance_loss_clip": 1.12838364, + "balance_loss_mlp": 1.02495849, + "epoch": 0.7760476161846931, + "flos": 22528451908800.0, + "grad_norm": 2.3954685963790827, + "language_loss": 0.73950231, + "learning_rate": 5.032964258857993e-07, + "loss": 0.76690114, + "num_input_tokens_seen": 138909955, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.3046875, + "step": 6454, + "time_per_iteration": 3.016268014907837 + }, + { + "auxiliary_loss_clip": 0.01474362, + "auxiliary_loss_mlp": 0.01263024, + "balance_loss_clip": 1.11766648, + "balance_loss_mlp": 1.03395152, + "epoch": 0.7761678590753321, + "flos": 48655108568160.0, + "grad_norm": 1.6625506742308191, + "language_loss": 0.68454266, + "learning_rate": 5.027798459366329e-07, + "loss": 0.71191657, + "num_input_tokens_seen": 138935320, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.29296875, + "step": 6455, + "time_per_iteration": 3.199140787124634 + }, + { + "auxiliary_loss_clip": 0.01479628, + "auxiliary_loss_mlp": 0.01255568, + "balance_loss_clip": 1.12250888, + "balance_loss_mlp": 1.02401543, + "epoch": 0.7762881019659713, + "flos": 26179453022400.0, + "grad_norm": 1.4950525496149556, + "language_loss": 0.63530719, + "learning_rate": 5.02263493114573e-07, + "loss": 0.66265917, + "num_input_tokens_seen": 138957115, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.31054688, + "step": 6456, + "time_per_iteration": 3.0456552505493164 + }, + { + "auxiliary_loss_clip": 0.01475052, + "auxiliary_loss_mlp": 0.0125598, + "balance_loss_clip": 1.11846828, + "balance_loss_mlp": 1.02881479, + "epoch": 0.7764083448566104, + "flos": 20590076054880.0, + "grad_norm": 2.298387715503108, + "language_loss": 0.77120441, + "learning_rate": 5.017473674979502e-07, + "loss": 0.79851472, + "num_input_tokens_seen": 138973140, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.27148438, + "step": 6457, + "time_per_iteration": 3.0258450508117676 + }, + { + "auxiliary_loss_clip": 0.01432091, + "auxiliary_loss_mlp": 0.01191773, + "balance_loss_clip": 1.09157979, + "balance_loss_mlp": 1.00561523, + "epoch": 0.7765285877472494, + "flos": 67299639982560.0, + "grad_norm": 0.7489573127209309, + "language_loss": 0.58344996, + "learning_rate": 5.01231469165061e-07, + "loss": 0.60968864, + "num_input_tokens_seen": 139028965, + "router_z_loss_clip": 3.40625, + "router_z_loss_mlp": 1.859375, + "step": 6458, + "time_per_iteration": 3.3308017253875732 + }, + { + "auxiliary_loss_clip": 0.01432884, + "auxiliary_loss_mlp": 0.01192299, + "balance_loss_clip": 1.09233057, + "balance_loss_mlp": 1.00652313, + "epoch": 0.7766488306378886, + "flos": 61350624558720.0, + "grad_norm": 0.8234499215031661, + "language_loss": 0.56814998, + "learning_rate": 5.007157981941663e-07, + "loss": 0.59440178, + "num_input_tokens_seen": 139094325, + "router_z_loss_clip": 3.40625, + "router_z_loss_mlp": 1.85546875, + "step": 6459, + "time_per_iteration": 3.4386136531829834 + }, + { + "auxiliary_loss_clip": 0.01431219, + "auxiliary_loss_mlp": 0.01195015, + "balance_loss_clip": 1.09106135, + "balance_loss_mlp": 1.00885773, + "epoch": 0.7767690735285276, + "flos": 62952233001120.0, + "grad_norm": 1.1368520898024648, + "language_loss": 0.6742754, + "learning_rate": 5.002003546634928e-07, + "loss": 0.70053768, + "num_input_tokens_seen": 139150425, + "router_z_loss_clip": 3.40625, + "router_z_loss_mlp": 1.859375, + "step": 6460, + "time_per_iteration": 3.338068962097168 + }, + { + "auxiliary_loss_clip": 0.01477387, + "auxiliary_loss_mlp": 0.01258371, + "balance_loss_clip": 1.11986172, + "balance_loss_mlp": 1.02929878, + "epoch": 0.7768893164191667, + "flos": 20888497732320.0, + "grad_norm": 1.8411831478373653, + "language_loss": 0.76170069, + "learning_rate": 4.996851386512331e-07, + "loss": 0.78905833, + "num_input_tokens_seen": 139169130, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 2.29101562, + "step": 6461, + "time_per_iteration": 3.0535354614257812 + }, + { + "auxiliary_loss_clip": 0.0147613, + "auxiliary_loss_mlp": 0.01264388, + "balance_loss_clip": 1.1195209, + "balance_loss_mlp": 1.03092885, + "epoch": 0.7770095593098058, + "flos": 20706706239840.0, + "grad_norm": 2.143841929592168, + "language_loss": 0.83380675, + "learning_rate": 4.991701502355444e-07, + "loss": 0.8612119, + "num_input_tokens_seen": 139189595, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.33007812, + "step": 6462, + "time_per_iteration": 3.012338638305664 + }, + { + "auxiliary_loss_clip": 0.01474427, + "auxiliary_loss_mlp": 0.01253388, + "balance_loss_clip": 1.11740518, + "balance_loss_mlp": 1.02336192, + "epoch": 0.7771298022004449, + "flos": 24720038709120.0, + "grad_norm": 2.9080681620904056, + "language_loss": 0.75988638, + "learning_rate": 4.986553894945518e-07, + "loss": 0.78716445, + "num_input_tokens_seen": 139210805, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.29492188, + "step": 6463, + "time_per_iteration": 3.92220139503479 + }, + { + "auxiliary_loss_clip": 0.01476897, + "auxiliary_loss_mlp": 0.0125684, + "balance_loss_clip": 1.1197263, + "balance_loss_mlp": 1.03081894, + "epoch": 0.777250045091084, + "flos": 25011557461440.0, + "grad_norm": 2.240569687142155, + "language_loss": 0.86580884, + "learning_rate": 4.981408565063416e-07, + "loss": 0.89314616, + "num_input_tokens_seen": 139230750, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 2.25976562, + "step": 6464, + "time_per_iteration": 3.9223432540893555 + }, + { + "auxiliary_loss_clip": 0.01485337, + "auxiliary_loss_mlp": 0.01264742, + "balance_loss_clip": 1.1284827, + "balance_loss_mlp": 1.03414345, + "epoch": 0.777370287981723, + "flos": 20121924039840.0, + "grad_norm": 2.6611858647713302, + "language_loss": 0.76180393, + "learning_rate": 4.976265513489701e-07, + "loss": 0.78930473, + "num_input_tokens_seen": 139250720, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.30273438, + "step": 6465, + "time_per_iteration": 2.947376251220703 + }, + { + "auxiliary_loss_clip": 0.0148367, + "auxiliary_loss_mlp": 0.01257538, + "balance_loss_clip": 1.12649584, + "balance_loss_mlp": 1.02922869, + "epoch": 0.7774905308723622, + "flos": 21720574085760.0, + "grad_norm": 2.4718993713901143, + "language_loss": 0.80753136, + "learning_rate": 4.971124741004562e-07, + "loss": 0.83494347, + "num_input_tokens_seen": 139269720, + "router_z_loss_clip": 3.57617188, + "router_z_loss_mlp": 2.28320312, + "step": 6466, + "time_per_iteration": 3.099292039871216 + }, + { + "auxiliary_loss_clip": 0.01483633, + "auxiliary_loss_mlp": 0.01259106, + "balance_loss_clip": 1.12696433, + "balance_loss_mlp": 1.0309875, + "epoch": 0.7776107737630013, + "flos": 16035958415520.0, + "grad_norm": 1.9415802653284013, + "language_loss": 0.76497465, + "learning_rate": 4.965986248387846e-07, + "loss": 0.79240203, + "num_input_tokens_seen": 139288035, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.28320312, + "step": 6467, + "time_per_iteration": 3.070560932159424 + }, + { + "auxiliary_loss_clip": 0.0147865, + "auxiliary_loss_mlp": 0.01256163, + "balance_loss_clip": 1.12276208, + "balance_loss_mlp": 1.02766311, + "epoch": 0.7777310166536403, + "flos": 24793240786560.0, + "grad_norm": 1.7992505019433505, + "language_loss": 0.76921499, + "learning_rate": 4.960850036419073e-07, + "loss": 0.79656315, + "num_input_tokens_seen": 139307135, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.28320312, + "step": 6468, + "time_per_iteration": 3.9385902881622314 + }, + { + "auxiliary_loss_clip": 0.01477291, + "auxiliary_loss_mlp": 0.01266378, + "balance_loss_clip": 1.1213758, + "balance_loss_mlp": 1.03673327, + "epoch": 0.7778512595442795, + "flos": 17274325226400.0, + "grad_norm": 3.0073155218737293, + "language_loss": 0.7873897, + "learning_rate": 4.955716105877378e-07, + "loss": 0.81482643, + "num_input_tokens_seen": 139325905, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.296875, + "step": 6469, + "time_per_iteration": 2.999533176422119 + }, + { + "auxiliary_loss_clip": 0.01484279, + "auxiliary_loss_mlp": 0.01254202, + "balance_loss_clip": 1.12952304, + "balance_loss_mlp": 1.02532005, + "epoch": 0.7779715024349185, + "flos": 17750328370560.0, + "grad_norm": 1.8170255351268432, + "language_loss": 0.83164287, + "learning_rate": 4.950584457541598e-07, + "loss": 0.85902762, + "num_input_tokens_seen": 139344370, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.2890625, + "step": 6470, + "time_per_iteration": 2.982471227645874 + }, + { + "auxiliary_loss_clip": 0.01479446, + "auxiliary_loss_mlp": 0.01249047, + "balance_loss_clip": 1.12288809, + "balance_loss_mlp": 1.02188182, + "epoch": 0.7780917453255576, + "flos": 24319209906720.0, + "grad_norm": 2.4539253263898644, + "language_loss": 0.8201741, + "learning_rate": 4.945455092190183e-07, + "loss": 0.84745908, + "num_input_tokens_seen": 139365625, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.2734375, + "step": 6471, + "time_per_iteration": 2.9673168659210205 + }, + { + "auxiliary_loss_clip": 0.01438968, + "auxiliary_loss_mlp": 0.01196243, + "balance_loss_clip": 1.09816003, + "balance_loss_mlp": 1.00932312, + "epoch": 0.7782119882161967, + "flos": 56371517514720.0, + "grad_norm": 0.6828591493928257, + "language_loss": 0.55920851, + "learning_rate": 4.940328010601271e-07, + "loss": 0.58556056, + "num_input_tokens_seen": 139430540, + "router_z_loss_clip": 3.40625, + "router_z_loss_mlp": 1.8671875, + "step": 6472, + "time_per_iteration": 3.4682374000549316 + }, + { + "auxiliary_loss_clip": 0.01489774, + "auxiliary_loss_mlp": 0.01269632, + "balance_loss_clip": 1.13348484, + "balance_loss_mlp": 1.03827071, + "epoch": 0.7783322311068358, + "flos": 46793462110560.0, + "grad_norm": 2.208388445193746, + "language_loss": 0.76919675, + "learning_rate": 4.935203213552621e-07, + "loss": 0.79679084, + "num_input_tokens_seen": 139454280, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.30664062, + "step": 6473, + "time_per_iteration": 3.2028510570526123 + }, + { + "auxiliary_loss_clip": 0.01479462, + "auxiliary_loss_mlp": 0.01255158, + "balance_loss_clip": 1.12207556, + "balance_loss_mlp": 1.02684844, + "epoch": 0.7784524739974749, + "flos": 19059621569280.0, + "grad_norm": 3.093243669944255, + "language_loss": 0.66869146, + "learning_rate": 4.930080701821662e-07, + "loss": 0.69603765, + "num_input_tokens_seen": 139471745, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 2.28125, + "step": 6474, + "time_per_iteration": 2.9590725898742676 + }, + { + "auxiliary_loss_clip": 0.01475153, + "auxiliary_loss_mlp": 0.01256414, + "balance_loss_clip": 1.11920667, + "balance_loss_mlp": 1.02600598, + "epoch": 0.778572716888114, + "flos": 24793468355520.0, + "grad_norm": 1.941223722494661, + "language_loss": 0.77263254, + "learning_rate": 4.92496047618548e-07, + "loss": 0.79994822, + "num_input_tokens_seen": 139491505, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.29882812, + "step": 6475, + "time_per_iteration": 3.033087730407715 + }, + { + "auxiliary_loss_clip": 0.01485868, + "auxiliary_loss_mlp": 0.01259357, + "balance_loss_clip": 1.13052535, + "balance_loss_mlp": 1.0268513, + "epoch": 0.7786929597787531, + "flos": 20079823417920.0, + "grad_norm": 1.9814955175835938, + "language_loss": 0.7762804, + "learning_rate": 4.919842537420811e-07, + "loss": 0.80373275, + "num_input_tokens_seen": 139508620, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 2.32226562, + "step": 6476, + "time_per_iteration": 3.791435718536377 + }, + { + "auxiliary_loss_clip": 0.01485728, + "auxiliary_loss_mlp": 0.01253959, + "balance_loss_clip": 1.13116074, + "balance_loss_mlp": 1.02603126, + "epoch": 0.7788132026693921, + "flos": 21874412160000.0, + "grad_norm": 2.3165232632598456, + "language_loss": 0.79142308, + "learning_rate": 4.91472688630404e-07, + "loss": 0.81881994, + "num_input_tokens_seen": 139529360, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.28125, + "step": 6477, + "time_per_iteration": 3.088080406188965 + }, + { + "auxiliary_loss_clip": 0.01472522, + "auxiliary_loss_mlp": 0.01252367, + "balance_loss_clip": 1.11583745, + "balance_loss_mlp": 1.02386665, + "epoch": 0.7789334455600313, + "flos": 11183532883200.0, + "grad_norm": 2.852300125495765, + "language_loss": 0.73996806, + "learning_rate": 4.909613523611202e-07, + "loss": 0.76721692, + "num_input_tokens_seen": 139546240, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.28320312, + "step": 6478, + "time_per_iteration": 3.0377540588378906 + }, + { + "auxiliary_loss_clip": 0.01480017, + "auxiliary_loss_mlp": 0.01258746, + "balance_loss_clip": 1.12345052, + "balance_loss_mlp": 1.02757525, + "epoch": 0.7790536884506704, + "flos": 28698021768960.0, + "grad_norm": 1.6895184977880977, + "language_loss": 0.74850589, + "learning_rate": 4.904502450117991e-07, + "loss": 0.77589351, + "num_input_tokens_seen": 139567200, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.30859375, + "step": 6479, + "time_per_iteration": 2.994947910308838 + }, + { + "auxiliary_loss_clip": 0.01480581, + "auxiliary_loss_mlp": 0.0126328, + "balance_loss_clip": 1.12428999, + "balance_loss_mlp": 1.0336349, + "epoch": 0.7791739313413094, + "flos": 11073881479680.0, + "grad_norm": 5.108098281699593, + "language_loss": 0.7180559, + "learning_rate": 4.899393666599762e-07, + "loss": 0.74549448, + "num_input_tokens_seen": 139583775, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.29492188, + "step": 6480, + "time_per_iteration": 3.0204427242279053 + }, + { + "auxiliary_loss_clip": 0.0147308, + "auxiliary_loss_mlp": 0.01256985, + "balance_loss_clip": 1.11560035, + "balance_loss_mlp": 1.02791214, + "epoch": 0.7792941742319486, + "flos": 14680771778880.0, + "grad_norm": 2.2782320935883162, + "language_loss": 0.72703391, + "learning_rate": 4.894287173831506e-07, + "loss": 0.75433457, + "num_input_tokens_seen": 139599735, + "router_z_loss_clip": 3.58007812, + "router_z_loss_mlp": 2.2890625, + "step": 6481, + "time_per_iteration": 2.9644572734832764 + }, + { + "auxiliary_loss_clip": 0.01476129, + "auxiliary_loss_mlp": 0.01270319, + "balance_loss_clip": 1.12104738, + "balance_loss_mlp": 1.03914785, + "epoch": 0.7794144171225876, + "flos": 23261155390080.0, + "grad_norm": 2.801700803877274, + "language_loss": 0.8431468, + "learning_rate": 4.889182972587877e-07, + "loss": 0.87061119, + "num_input_tokens_seen": 139619030, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.30859375, + "step": 6482, + "time_per_iteration": 3.0284295082092285 + }, + { + "auxiliary_loss_clip": 0.01488546, + "auxiliary_loss_mlp": 0.01254765, + "balance_loss_clip": 1.13228905, + "balance_loss_mlp": 1.0272181, + "epoch": 0.7795346600132267, + "flos": 21509236192320.0, + "grad_norm": 2.1991398104625457, + "language_loss": 0.66484684, + "learning_rate": 4.884081063643177e-07, + "loss": 0.69227993, + "num_input_tokens_seen": 139637690, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.27734375, + "step": 6483, + "time_per_iteration": 3.0355124473571777 + }, + { + "auxiliary_loss_clip": 0.01437338, + "auxiliary_loss_mlp": 0.01196343, + "balance_loss_clip": 1.09766746, + "balance_loss_mlp": 1.01056671, + "epoch": 0.7796549029038659, + "flos": 70058220675840.0, + "grad_norm": 0.8648076181480048, + "language_loss": 0.52430594, + "learning_rate": 4.878981447771353e-07, + "loss": 0.55064273, + "num_input_tokens_seen": 139692070, + "router_z_loss_clip": 3.40625, + "router_z_loss_mlp": 1.85546875, + "step": 6484, + "time_per_iteration": 3.4392359256744385 + }, + { + "auxiliary_loss_clip": 0.01485015, + "auxiliary_loss_mlp": 0.01253304, + "balance_loss_clip": 1.12865496, + "balance_loss_mlp": 1.02213359, + "epoch": 0.7797751457945049, + "flos": 23991772822560.0, + "grad_norm": 1.9022793605757118, + "language_loss": 0.73291141, + "learning_rate": 4.873884125746035e-07, + "loss": 0.76029462, + "num_input_tokens_seen": 139713745, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.31054688, + "step": 6485, + "time_per_iteration": 3.0471858978271484 + }, + { + "auxiliary_loss_clip": 0.01478399, + "auxiliary_loss_mlp": 0.01261729, + "balance_loss_clip": 1.12141061, + "balance_loss_mlp": 1.03494525, + "epoch": 0.779895388685144, + "flos": 22676980040640.0, + "grad_norm": 2.250379590557882, + "language_loss": 0.72521687, + "learning_rate": 4.868789098340456e-07, + "loss": 0.75261819, + "num_input_tokens_seen": 139731650, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.26953125, + "step": 6486, + "time_per_iteration": 3.0097057819366455 + }, + { + "auxiliary_loss_clip": 0.01478699, + "auxiliary_loss_mlp": 0.01260718, + "balance_loss_clip": 1.12275267, + "balance_loss_mlp": 1.03202701, + "epoch": 0.7800156315757831, + "flos": 23770952889120.0, + "grad_norm": 2.4134169862748887, + "language_loss": 0.73191524, + "learning_rate": 4.863696366327543e-07, + "loss": 0.75930941, + "num_input_tokens_seen": 139750820, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.28320312, + "step": 6487, + "time_per_iteration": 2.993903875350952 + }, + { + "auxiliary_loss_clip": 0.01474665, + "auxiliary_loss_mlp": 0.01257983, + "balance_loss_clip": 1.11728942, + "balance_loss_mlp": 1.0266217, + "epoch": 0.7801358744664222, + "flos": 26431981261920.0, + "grad_norm": 1.8072239574147924, + "language_loss": 0.77991217, + "learning_rate": 4.85860593047986e-07, + "loss": 0.8072387, + "num_input_tokens_seen": 139770885, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.30859375, + "step": 6488, + "time_per_iteration": 3.037736654281616 + }, + { + "auxiliary_loss_clip": 0.0147949, + "auxiliary_loss_mlp": 0.0125515, + "balance_loss_clip": 1.12203562, + "balance_loss_mlp": 1.02664995, + "epoch": 0.7802561173570612, + "flos": 26324529691680.0, + "grad_norm": 1.6063435860148967, + "language_loss": 0.7488699, + "learning_rate": 4.853517791569613e-07, + "loss": 0.77621627, + "num_input_tokens_seen": 139793065, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 2.28515625, + "step": 6489, + "time_per_iteration": 2.997849225997925 + }, + { + "auxiliary_loss_clip": 0.01477236, + "auxiliary_loss_mlp": 0.01265355, + "balance_loss_clip": 1.12035549, + "balance_loss_mlp": 1.03494763, + "epoch": 0.7803763602477004, + "flos": 40336962805440.0, + "grad_norm": 2.824075937237646, + "language_loss": 0.66285729, + "learning_rate": 4.848431950368684e-07, + "loss": 0.69028318, + "num_input_tokens_seen": 139815625, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 2.30273438, + "step": 6490, + "time_per_iteration": 3.1929235458374023 + }, + { + "auxiliary_loss_clip": 0.01435076, + "auxiliary_loss_mlp": 0.01197464, + "balance_loss_clip": 1.09548283, + "balance_loss_mlp": 1.01130676, + "epoch": 0.7804966031383395, + "flos": 67008007445760.0, + "grad_norm": 0.7153010264690449, + "language_loss": 0.55663604, + "learning_rate": 4.843348407648569e-07, + "loss": 0.58296144, + "num_input_tokens_seen": 139876905, + "router_z_loss_clip": 3.3984375, + "router_z_loss_mlp": 1.859375, + "step": 6491, + "time_per_iteration": 5.129528284072876 + }, + { + "auxiliary_loss_clip": 0.01472762, + "auxiliary_loss_mlp": 0.01260309, + "balance_loss_clip": 1.11550283, + "balance_loss_mlp": 1.03199959, + "epoch": 0.7806168460289785, + "flos": 17742211744320.0, + "grad_norm": 4.003419761045221, + "language_loss": 0.83258414, + "learning_rate": 4.838267164180457e-07, + "loss": 0.8599149, + "num_input_tokens_seen": 139892575, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 2.27929688, + "step": 6492, + "time_per_iteration": 2.9433648586273193 + }, + { + "auxiliary_loss_clip": 0.01480305, + "auxiliary_loss_mlp": 0.0126987, + "balance_loss_clip": 1.1238234, + "balance_loss_mlp": 1.03850865, + "epoch": 0.7807370889196176, + "flos": 23948230930560.0, + "grad_norm": 1.9466150645975464, + "language_loss": 0.83832204, + "learning_rate": 4.833188220735156e-07, + "loss": 0.86582375, + "num_input_tokens_seen": 139912245, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.31054688, + "step": 6493, + "time_per_iteration": 2.9662766456604004 + }, + { + "auxiliary_loss_clip": 0.01474282, + "auxiliary_loss_mlp": 0.01260186, + "balance_loss_clip": 1.11825562, + "balance_loss_mlp": 1.03111327, + "epoch": 0.7808573318102567, + "flos": 18991008799200.0, + "grad_norm": 2.1428955711784625, + "language_loss": 0.74900162, + "learning_rate": 4.828111578083152e-07, + "loss": 0.77634633, + "num_input_tokens_seen": 139929150, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.28710938, + "step": 6494, + "time_per_iteration": 2.971113443374634 + }, + { + "auxiliary_loss_clip": 0.01478674, + "auxiliary_loss_mlp": 0.01265769, + "balance_loss_clip": 1.12132657, + "balance_loss_mlp": 1.03726888, + "epoch": 0.7809775747008958, + "flos": 23982632136000.0, + "grad_norm": 2.9784338930257515, + "language_loss": 0.81274873, + "learning_rate": 4.823037236994556e-07, + "loss": 0.84019315, + "num_input_tokens_seen": 139947315, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.28320312, + "step": 6495, + "time_per_iteration": 2.9544944763183594 + }, + { + "auxiliary_loss_clip": 0.0143352, + "auxiliary_loss_mlp": 0.01197067, + "balance_loss_clip": 1.09434366, + "balance_loss_mlp": 1.01052856, + "epoch": 0.7810978175915348, + "flos": 68542634028960.0, + "grad_norm": 0.7186237574399307, + "language_loss": 0.56290764, + "learning_rate": 4.817965198239136e-07, + "loss": 0.58921349, + "num_input_tokens_seen": 140013775, + "router_z_loss_clip": 3.390625, + "router_z_loss_mlp": 1.86328125, + "step": 6496, + "time_per_iteration": 4.286168336868286 + }, + { + "auxiliary_loss_clip": 0.01477788, + "auxiliary_loss_mlp": 0.01269442, + "balance_loss_clip": 1.12104702, + "balance_loss_mlp": 1.04094124, + "epoch": 0.781218060482174, + "flos": 19643948631360.0, + "grad_norm": 3.8750861208897556, + "language_loss": 0.74636984, + "learning_rate": 4.812895462586331e-07, + "loss": 0.7738421, + "num_input_tokens_seen": 140031600, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.28515625, + "step": 6497, + "time_per_iteration": 3.0307912826538086 + }, + { + "auxiliary_loss_clip": 0.01478981, + "auxiliary_loss_mlp": 0.01264325, + "balance_loss_clip": 1.12276089, + "balance_loss_mlp": 1.0346806, + "epoch": 0.7813383033728131, + "flos": 25630134016320.0, + "grad_norm": 1.9381474623160182, + "language_loss": 0.82098782, + "learning_rate": 4.807828030805207e-07, + "loss": 0.84842098, + "num_input_tokens_seen": 140050590, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.29296875, + "step": 6498, + "time_per_iteration": 3.0355451107025146 + }, + { + "auxiliary_loss_clip": 0.0147716, + "auxiliary_loss_mlp": 0.0125505, + "balance_loss_clip": 1.12059736, + "balance_loss_mlp": 1.02750361, + "epoch": 0.7814585462634521, + "flos": 20488048211520.0, + "grad_norm": 2.163873165693017, + "language_loss": 0.68239319, + "learning_rate": 4.802762903664495e-07, + "loss": 0.70971531, + "num_input_tokens_seen": 140069770, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.27929688, + "step": 6499, + "time_per_iteration": 3.0431485176086426 + }, + { + "auxiliary_loss_clip": 0.0148296, + "auxiliary_loss_mlp": 0.01259956, + "balance_loss_clip": 1.12695122, + "balance_loss_mlp": 1.02935743, + "epoch": 0.7815787891540913, + "flos": 22306190705280.0, + "grad_norm": 2.302845350170384, + "language_loss": 0.74201846, + "learning_rate": 4.797700081932565e-07, + "loss": 0.76944762, + "num_input_tokens_seen": 140087635, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.3046875, + "step": 6500, + "time_per_iteration": 3.1290409564971924 + }, + { + "auxiliary_loss_clip": 0.01476541, + "auxiliary_loss_mlp": 0.01258286, + "balance_loss_clip": 1.11921453, + "balance_loss_mlp": 1.03073919, + "epoch": 0.7816990320447303, + "flos": 22602829759200.0, + "grad_norm": 2.661019397798923, + "language_loss": 0.82153779, + "learning_rate": 4.792639566377442e-07, + "loss": 0.84888601, + "num_input_tokens_seen": 140105045, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 2.27929688, + "step": 6501, + "time_per_iteration": 3.089104652404785 + }, + { + "auxiliary_loss_clip": 0.01474406, + "auxiliary_loss_mlp": 0.0125811, + "balance_loss_clip": 1.11802876, + "balance_loss_mlp": 1.02980042, + "epoch": 0.7818192749353694, + "flos": 24938279527680.0, + "grad_norm": 1.7830306021710731, + "language_loss": 0.77645999, + "learning_rate": 4.78758135776681e-07, + "loss": 0.80378515, + "num_input_tokens_seen": 140124900, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.28125, + "step": 6502, + "time_per_iteration": 3.190190076828003 + }, + { + "auxiliary_loss_clip": 0.01477453, + "auxiliary_loss_mlp": 0.01258341, + "balance_loss_clip": 1.12053561, + "balance_loss_mlp": 1.03022242, + "epoch": 0.7819395178260086, + "flos": 23735337982560.0, + "grad_norm": 2.437186439529511, + "language_loss": 0.79094386, + "learning_rate": 4.782525456867989e-07, + "loss": 0.81830186, + "num_input_tokens_seen": 140143755, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 2.27929688, + "step": 6503, + "time_per_iteration": 3.064216136932373 + }, + { + "auxiliary_loss_clip": 0.01476691, + "auxiliary_loss_mlp": 0.01262484, + "balance_loss_clip": 1.12014413, + "balance_loss_mlp": 1.03131342, + "epoch": 0.7820597607166476, + "flos": 23223909572640.0, + "grad_norm": 1.6230299584438432, + "language_loss": 0.82953429, + "learning_rate": 4.777471864447959e-07, + "loss": 0.85692596, + "num_input_tokens_seen": 140164495, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.30664062, + "step": 6504, + "time_per_iteration": 3.7602107524871826 + }, + { + "auxiliary_loss_clip": 0.01478227, + "auxiliary_loss_mlp": 0.01260087, + "balance_loss_clip": 1.1236757, + "balance_loss_mlp": 1.02872586, + "epoch": 0.7821800036072867, + "flos": 22311728216640.0, + "grad_norm": 2.4418446689974265, + "language_loss": 0.81096989, + "learning_rate": 4.772420581273344e-07, + "loss": 0.83835304, + "num_input_tokens_seen": 140181980, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 2.30859375, + "step": 6505, + "time_per_iteration": 3.0002286434173584 + }, + { + "auxiliary_loss_clip": 0.01480025, + "auxiliary_loss_mlp": 0.01255893, + "balance_loss_clip": 1.12322354, + "balance_loss_mlp": 1.02834582, + "epoch": 0.7823002464979258, + "flos": 21546368225280.0, + "grad_norm": 2.0219888779849526, + "language_loss": 0.76035088, + "learning_rate": 4.7673716081104134e-07, + "loss": 0.78771001, + "num_input_tokens_seen": 140202155, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.27539062, + "step": 6506, + "time_per_iteration": 3.005249261856079 + }, + { + "auxiliary_loss_clip": 0.01477503, + "auxiliary_loss_mlp": 0.01256765, + "balance_loss_clip": 1.12202621, + "balance_loss_mlp": 1.0288372, + "epoch": 0.7824204893885649, + "flos": 24537981719520.0, + "grad_norm": 1.8512014173540106, + "language_loss": 0.8467778, + "learning_rate": 4.762324945725109e-07, + "loss": 0.87412047, + "num_input_tokens_seen": 140221600, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.27929688, + "step": 6507, + "time_per_iteration": 3.0238094329833984 + }, + { + "auxiliary_loss_clip": 0.01479275, + "auxiliary_loss_mlp": 0.01254056, + "balance_loss_clip": 1.12330365, + "balance_loss_mlp": 1.02460217, + "epoch": 0.782540732279204, + "flos": 27417554336160.0, + "grad_norm": 1.7406465676132872, + "language_loss": 0.75906825, + "learning_rate": 4.7572805948829844e-07, + "loss": 0.78640163, + "num_input_tokens_seen": 140241860, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.29296875, + "step": 6508, + "time_per_iteration": 2.974306106567383 + }, + { + "auxiliary_loss_clip": 0.0147477, + "auxiliary_loss_mlp": 0.01259276, + "balance_loss_clip": 1.11854804, + "balance_loss_mlp": 1.03020358, + "epoch": 0.7826609751698431, + "flos": 24355545448320.0, + "grad_norm": 2.123096301313817, + "language_loss": 0.71163207, + "learning_rate": 4.7522385563492795e-07, + "loss": 0.73897254, + "num_input_tokens_seen": 140262160, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.29101562, + "step": 6509, + "time_per_iteration": 2.9216647148132324 + }, + { + "auxiliary_loss_clip": 0.01478094, + "auxiliary_loss_mlp": 0.01261541, + "balance_loss_clip": 1.12150073, + "balance_loss_mlp": 1.03151441, + "epoch": 0.7827812180604822, + "flos": 23990862546720.0, + "grad_norm": 2.168964825518593, + "language_loss": 0.70341015, + "learning_rate": 4.747198830888863e-07, + "loss": 0.73080647, + "num_input_tokens_seen": 140282030, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.296875, + "step": 6510, + "time_per_iteration": 3.013420581817627 + }, + { + "auxiliary_loss_clip": 0.01477454, + "auxiliary_loss_mlp": 0.01254661, + "balance_loss_clip": 1.1216433, + "balance_loss_mlp": 1.02692378, + "epoch": 0.7829014609511212, + "flos": 27456810346080.0, + "grad_norm": 2.348312373197577, + "language_loss": 0.68808305, + "learning_rate": 4.742161419266251e-07, + "loss": 0.71540415, + "num_input_tokens_seen": 140301190, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.27734375, + "step": 6511, + "time_per_iteration": 3.036078453063965 + }, + { + "auxiliary_loss_clip": 0.01483904, + "auxiliary_loss_mlp": 0.01272487, + "balance_loss_clip": 1.12647796, + "balance_loss_mlp": 1.04398656, + "epoch": 0.7830217038417604, + "flos": 29207060704800.0, + "grad_norm": 2.6300424895645222, + "language_loss": 0.6503216, + "learning_rate": 4.7371263222456304e-07, + "loss": 0.67788553, + "num_input_tokens_seen": 140318510, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.28320312, + "step": 6512, + "time_per_iteration": 3.0147740840911865 + }, + { + "auxiliary_loss_clip": 0.01432667, + "auxiliary_loss_mlp": 0.0119165, + "balance_loss_clip": 1.09294033, + "balance_loss_mlp": 1.00244141, + "epoch": 0.7831419467323995, + "flos": 60957457244640.0, + "grad_norm": 0.8036404137725328, + "language_loss": 0.61258382, + "learning_rate": 4.7320935405908004e-07, + "loss": 0.63882697, + "num_input_tokens_seen": 140379380, + "router_z_loss_clip": 3.40625, + "router_z_loss_mlp": 1.890625, + "step": 6513, + "time_per_iteration": 3.4700355529785156 + }, + { + "auxiliary_loss_clip": 0.01476195, + "auxiliary_loss_mlp": 0.01270449, + "balance_loss_clip": 1.11878753, + "balance_loss_mlp": 1.04099464, + "epoch": 0.7832621896230385, + "flos": 19684570055040.0, + "grad_norm": 2.1768117653468937, + "language_loss": 0.84014696, + "learning_rate": 4.7270630750652475e-07, + "loss": 0.86761338, + "num_input_tokens_seen": 140395335, + "router_z_loss_clip": 3.58007812, + "router_z_loss_mlp": 2.29101562, + "step": 6514, + "time_per_iteration": 2.9505090713500977 + }, + { + "auxiliary_loss_clip": 0.01472345, + "auxiliary_loss_mlp": 0.01251911, + "balance_loss_clip": 1.11580157, + "balance_loss_mlp": 1.02436447, + "epoch": 0.7833824325136777, + "flos": 25011557461440.0, + "grad_norm": 2.2138062433331225, + "language_loss": 0.80555999, + "learning_rate": 4.7220349264320746e-07, + "loss": 0.83280259, + "num_input_tokens_seen": 140414420, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.2734375, + "step": 6515, + "time_per_iteration": 2.9894819259643555 + }, + { + "auxiliary_loss_clip": 0.01433255, + "auxiliary_loss_mlp": 0.0118959, + "balance_loss_clip": 1.09357047, + "balance_loss_mlp": 1.00190735, + "epoch": 0.7835026754043167, + "flos": 68807413064160.0, + "grad_norm": 0.7395800663364002, + "language_loss": 0.54906118, + "learning_rate": 4.71700909545407e-07, + "loss": 0.57528967, + "num_input_tokens_seen": 140477365, + "router_z_loss_clip": 3.3984375, + "router_z_loss_mlp": 1.875, + "step": 6516, + "time_per_iteration": 3.356231689453125 + }, + { + "auxiliary_loss_clip": 0.01478969, + "auxiliary_loss_mlp": 0.01257173, + "balance_loss_clip": 1.12239587, + "balance_loss_mlp": 1.02867281, + "epoch": 0.7836229182949558, + "flos": 19866627044640.0, + "grad_norm": 2.594942461605767, + "language_loss": 0.77199292, + "learning_rate": 4.711985582893627e-07, + "loss": 0.79935431, + "num_input_tokens_seen": 140495885, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.28320312, + "step": 6517, + "time_per_iteration": 2.9036507606506348 + }, + { + "auxiliary_loss_clip": 0.01479106, + "auxiliary_loss_mlp": 0.01263327, + "balance_loss_clip": 1.1218257, + "balance_loss_mlp": 1.03368235, + "epoch": 0.783743161185595, + "flos": 22968043655040.0, + "grad_norm": 2.081463990328104, + "language_loss": 0.71760005, + "learning_rate": 4.706964389512811e-07, + "loss": 0.74502438, + "num_input_tokens_seen": 140515920, + "router_z_loss_clip": 3.57617188, + "router_z_loss_mlp": 2.29492188, + "step": 6518, + "time_per_iteration": 3.925912857055664 + }, + { + "auxiliary_loss_clip": 0.01483929, + "auxiliary_loss_mlp": 0.01264169, + "balance_loss_clip": 1.12700582, + "balance_loss_mlp": 1.03776705, + "epoch": 0.783863404076234, + "flos": 12460131643680.0, + "grad_norm": 2.5404141035511985, + "language_loss": 0.87722361, + "learning_rate": 4.701945516073345e-07, + "loss": 0.90470469, + "num_input_tokens_seen": 140533395, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.26367188, + "step": 6519, + "time_per_iteration": 3.912781000137329 + }, + { + "auxiliary_loss_clip": 0.01478075, + "auxiliary_loss_mlp": 0.01260109, + "balance_loss_clip": 1.12060738, + "balance_loss_mlp": 1.03275263, + "epoch": 0.7839836469668731, + "flos": 24246273326400.0, + "grad_norm": 1.9563875390060566, + "language_loss": 0.7503351, + "learning_rate": 4.696928963336577e-07, + "loss": 0.77771699, + "num_input_tokens_seen": 140552825, + "router_z_loss_clip": 3.57617188, + "router_z_loss_mlp": 2.27148438, + "step": 6520, + "time_per_iteration": 3.0559144020080566 + }, + { + "auxiliary_loss_clip": 0.01432948, + "auxiliary_loss_mlp": 0.01193481, + "balance_loss_clip": 1.09306538, + "balance_loss_mlp": 1.00465393, + "epoch": 0.7841038898575122, + "flos": 62128804268160.0, + "grad_norm": 0.852343840802665, + "language_loss": 0.6090343, + "learning_rate": 4.6919147320635224e-07, + "loss": 0.63529861, + "num_input_tokens_seen": 140615535, + "router_z_loss_clip": 3.40625, + "router_z_loss_mlp": 1.88671875, + "step": 6521, + "time_per_iteration": 3.40998911857605 + }, + { + "auxiliary_loss_clip": 0.01478329, + "auxiliary_loss_mlp": 0.0126464, + "balance_loss_clip": 1.12137902, + "balance_loss_mlp": 1.03613973, + "epoch": 0.7842241327481513, + "flos": 20195998464960.0, + "grad_norm": 2.10301181072386, + "language_loss": 0.72927624, + "learning_rate": 4.6869028230148286e-07, + "loss": 0.75670588, + "num_input_tokens_seen": 140633330, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 2.28515625, + "step": 6522, + "time_per_iteration": 2.9963319301605225 + }, + { + "auxiliary_loss_clip": 0.01474746, + "auxiliary_loss_mlp": 0.01256793, + "balance_loss_clip": 1.11694479, + "balance_loss_mlp": 1.0275296, + "epoch": 0.7843443756387903, + "flos": 28077055740000.0, + "grad_norm": 2.6494978746755127, + "language_loss": 0.60166341, + "learning_rate": 4.6818932369507957e-07, + "loss": 0.62897879, + "num_input_tokens_seen": 140652830, + "router_z_loss_clip": 3.57617188, + "router_z_loss_mlp": 2.29492188, + "step": 6523, + "time_per_iteration": 3.0546789169311523 + }, + { + "auxiliary_loss_clip": 0.01478112, + "auxiliary_loss_mlp": 0.01257243, + "balance_loss_clip": 1.12023866, + "balance_loss_mlp": 1.02912402, + "epoch": 0.7844646185294295, + "flos": 21325472435520.0, + "grad_norm": 2.2611201874916427, + "language_loss": 0.89009416, + "learning_rate": 4.676885974631386e-07, + "loss": 0.91744769, + "num_input_tokens_seen": 140671190, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 2.27929688, + "step": 6524, + "time_per_iteration": 3.8817391395568848 + }, + { + "auxiliary_loss_clip": 0.01476424, + "auxiliary_loss_mlp": 0.01261154, + "balance_loss_clip": 1.11896384, + "balance_loss_mlp": 1.0315094, + "epoch": 0.7845848614200686, + "flos": 23658684442560.0, + "grad_norm": 2.0983633389002625, + "language_loss": 0.81225222, + "learning_rate": 4.67188103681619e-07, + "loss": 0.83962798, + "num_input_tokens_seen": 140690975, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 2.29492188, + "step": 6525, + "time_per_iteration": 3.0917770862579346 + }, + { + "auxiliary_loss_clip": 0.01472224, + "auxiliary_loss_mlp": 0.01265591, + "balance_loss_clip": 1.11477506, + "balance_loss_mlp": 1.03670883, + "epoch": 0.7847051043107076, + "flos": 23404335651360.0, + "grad_norm": 2.178892002258289, + "language_loss": 0.69329047, + "learning_rate": 4.666878424264453e-07, + "loss": 0.72066855, + "num_input_tokens_seen": 140710930, + "router_z_loss_clip": 3.58007812, + "router_z_loss_mlp": 2.28710938, + "step": 6526, + "time_per_iteration": 3.0540621280670166 + }, + { + "auxiliary_loss_clip": 0.01486049, + "auxiliary_loss_mlp": 0.01250103, + "balance_loss_clip": 1.13075984, + "balance_loss_mlp": 1.02598917, + "epoch": 0.7848253472013467, + "flos": 19024082519040.0, + "grad_norm": 1.9895649802472328, + "language_loss": 0.74010968, + "learning_rate": 4.661878137735069e-07, + "loss": 0.76747119, + "num_input_tokens_seen": 140729120, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.24609375, + "step": 6527, + "time_per_iteration": 2.984362840652466 + }, + { + "auxiliary_loss_clip": 0.01477423, + "auxiliary_loss_mlp": 0.01249317, + "balance_loss_clip": 1.11907256, + "balance_loss_mlp": 1.02215195, + "epoch": 0.7849455900919858, + "flos": 21181343970240.0, + "grad_norm": 2.4925569658557127, + "language_loss": 0.74804223, + "learning_rate": 4.656880177986571e-07, + "loss": 0.77530962, + "num_input_tokens_seen": 140747665, + "router_z_loss_clip": 3.58203125, + "router_z_loss_mlp": 2.27148438, + "step": 6528, + "time_per_iteration": 2.958911657333374 + }, + { + "auxiliary_loss_clip": 0.01476697, + "auxiliary_loss_mlp": 0.01259624, + "balance_loss_clip": 1.11886048, + "balance_loss_mlp": 1.03017044, + "epoch": 0.7850658329826249, + "flos": 19538507253600.0, + "grad_norm": 2.2955609436646163, + "language_loss": 0.81899166, + "learning_rate": 4.6518845457771607e-07, + "loss": 0.84635496, + "num_input_tokens_seen": 140766525, + "router_z_loss_clip": 3.58203125, + "router_z_loss_mlp": 2.29296875, + "step": 6529, + "time_per_iteration": 2.9219412803649902 + }, + { + "auxiliary_loss_clip": 0.01478199, + "auxiliary_loss_mlp": 0.01256972, + "balance_loss_clip": 1.12054253, + "balance_loss_mlp": 1.02637374, + "epoch": 0.7851860758732639, + "flos": 12496998179520.0, + "grad_norm": 2.059527791839123, + "language_loss": 0.79110587, + "learning_rate": 4.646891241864652e-07, + "loss": 0.8184576, + "num_input_tokens_seen": 140785090, + "router_z_loss_clip": 3.58007812, + "router_z_loss_mlp": 2.30273438, + "step": 6530, + "time_per_iteration": 3.045135259628296 + }, + { + "auxiliary_loss_clip": 0.01477727, + "auxiliary_loss_mlp": 0.01267351, + "balance_loss_clip": 1.12003183, + "balance_loss_mlp": 1.03541756, + "epoch": 0.7853063187639031, + "flos": 22962885425280.0, + "grad_norm": 1.9621544252091572, + "language_loss": 0.7309556, + "learning_rate": 4.6419002670065397e-07, + "loss": 0.7584064, + "num_input_tokens_seen": 140804670, + "router_z_loss_clip": 3.58007812, + "router_z_loss_mlp": 2.31640625, + "step": 6531, + "time_per_iteration": 2.976933002471924 + }, + { + "auxiliary_loss_clip": 0.01479322, + "auxiliary_loss_mlp": 0.01260003, + "balance_loss_clip": 1.1228931, + "balance_loss_mlp": 1.03074026, + "epoch": 0.7854265616545422, + "flos": 17349158214720.0, + "grad_norm": 1.9696558547239456, + "language_loss": 0.86855495, + "learning_rate": 4.6369116219599445e-07, + "loss": 0.89594817, + "num_input_tokens_seen": 140820655, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.29101562, + "step": 6532, + "time_per_iteration": 3.825200080871582 + }, + { + "auxiliary_loss_clip": 0.01472363, + "auxiliary_loss_mlp": 0.01253382, + "balance_loss_clip": 1.11707044, + "balance_loss_mlp": 1.02430952, + "epoch": 0.7855468045451812, + "flos": 23840362150560.0, + "grad_norm": 1.637270031742235, + "language_loss": 0.79418349, + "learning_rate": 4.631925307481637e-07, + "loss": 0.82144099, + "num_input_tokens_seen": 140840470, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.29101562, + "step": 6533, + "time_per_iteration": 3.0338094234466553 + }, + { + "auxiliary_loss_clip": 0.01480227, + "auxiliary_loss_mlp": 0.01263249, + "balance_loss_clip": 1.12481737, + "balance_loss_mlp": 1.03646517, + "epoch": 0.7856670474358204, + "flos": 25669200385440.0, + "grad_norm": 2.289137778477607, + "language_loss": 0.76029563, + "learning_rate": 4.6269413243280533e-07, + "loss": 0.7877304, + "num_input_tokens_seen": 140859890, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.26757812, + "step": 6534, + "time_per_iteration": 2.9204518795013428 + }, + { + "auxiliary_loss_clip": 0.01482951, + "auxiliary_loss_mlp": 0.012606, + "balance_loss_clip": 1.12572968, + "balance_loss_mlp": 1.03076434, + "epoch": 0.7857872903264594, + "flos": 18146302368480.0, + "grad_norm": 3.7631471468330444, + "language_loss": 0.73966539, + "learning_rate": 4.621959673255236e-07, + "loss": 0.76710081, + "num_input_tokens_seen": 140876190, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.29882812, + "step": 6535, + "time_per_iteration": 2.9626717567443848 + }, + { + "auxiliary_loss_clip": 0.01479798, + "auxiliary_loss_mlp": 0.01256431, + "balance_loss_clip": 1.12291908, + "balance_loss_mlp": 1.03060079, + "epoch": 0.7859075332170985, + "flos": 14387546259360.0, + "grad_norm": 2.809658277336593, + "language_loss": 0.9015128, + "learning_rate": 4.6169803550189135e-07, + "loss": 0.92887509, + "num_input_tokens_seen": 140891885, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.25976562, + "step": 6536, + "time_per_iteration": 3.0955650806427 + }, + { + "auxiliary_loss_clip": 0.01481699, + "auxiliary_loss_mlp": 0.01268337, + "balance_loss_clip": 1.12439811, + "balance_loss_mlp": 1.0377388, + "epoch": 0.7860277761077377, + "flos": 19866058122240.0, + "grad_norm": 7.638107456344034, + "language_loss": 0.77384502, + "learning_rate": 4.6120033703744355e-07, + "loss": 0.80134535, + "num_input_tokens_seen": 140910780, + "router_z_loss_clip": 3.57617188, + "router_z_loss_mlp": 2.30273438, + "step": 6537, + "time_per_iteration": 3.02323055267334 + }, + { + "auxiliary_loss_clip": 0.01476516, + "auxiliary_loss_mlp": 0.01266264, + "balance_loss_clip": 1.12073207, + "balance_loss_mlp": 1.03890836, + "epoch": 0.7861480189983767, + "flos": 26398604116800.0, + "grad_norm": 2.6207446386517717, + "language_loss": 0.7829994, + "learning_rate": 4.607028720076822e-07, + "loss": 0.81042719, + "num_input_tokens_seen": 140927460, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.2734375, + "step": 6538, + "time_per_iteration": 3.0191798210144043 + }, + { + "auxiliary_loss_clip": 0.01481502, + "auxiliary_loss_mlp": 0.01261785, + "balance_loss_clip": 1.124107, + "balance_loss_mlp": 1.03442919, + "epoch": 0.7862682618890158, + "flos": 24238498053600.0, + "grad_norm": 3.148843977437568, + "language_loss": 0.73913074, + "learning_rate": 4.6020564048807074e-07, + "loss": 0.76656365, + "num_input_tokens_seen": 140945135, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.27148438, + "step": 6539, + "time_per_iteration": 2.985799789428711 + }, + { + "auxiliary_loss_clip": 0.01478506, + "auxiliary_loss_mlp": 0.01257628, + "balance_loss_clip": 1.12320399, + "balance_loss_mlp": 1.02760124, + "epoch": 0.7863885047796549, + "flos": 47554763788800.0, + "grad_norm": 2.2609550069501574, + "language_loss": 0.71701545, + "learning_rate": 4.5970864255403883e-07, + "loss": 0.74437678, + "num_input_tokens_seen": 140966660, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.296875, + "step": 6540, + "time_per_iteration": 3.2658896446228027 + }, + { + "auxiliary_loss_clip": 0.01473493, + "auxiliary_loss_mlp": 0.01249859, + "balance_loss_clip": 1.11468554, + "balance_loss_mlp": 1.02460098, + "epoch": 0.786508747670294, + "flos": 24391539636480.0, + "grad_norm": 1.8870118306781822, + "language_loss": 0.82354784, + "learning_rate": 4.59211878280982e-07, + "loss": 0.85078132, + "num_input_tokens_seen": 140986175, + "router_z_loss_clip": 3.5859375, + "router_z_loss_mlp": 2.25585938, + "step": 6541, + "time_per_iteration": 3.0639162063598633 + }, + { + "auxiliary_loss_clip": 0.01478858, + "auxiliary_loss_mlp": 0.01256675, + "balance_loss_clip": 1.12203228, + "balance_loss_mlp": 1.02893758, + "epoch": 0.786628990560933, + "flos": 18043212536640.0, + "grad_norm": 2.665059812571677, + "language_loss": 0.70520586, + "learning_rate": 4.587153477442578e-07, + "loss": 0.73256117, + "num_input_tokens_seen": 141002490, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.27734375, + "step": 6542, + "time_per_iteration": 2.960655927658081 + }, + { + "auxiliary_loss_clip": 0.01480728, + "auxiliary_loss_mlp": 0.01257764, + "balance_loss_clip": 1.12311625, + "balance_loss_mlp": 1.02697492, + "epoch": 0.7867492334515722, + "flos": 25851257375040.0, + "grad_norm": 8.215272950969696, + "language_loss": 0.81567103, + "learning_rate": 4.582190510191899e-07, + "loss": 0.84305596, + "num_input_tokens_seen": 141021150, + "router_z_loss_clip": 3.57617188, + "router_z_loss_mlp": 2.30664062, + "step": 6543, + "time_per_iteration": 3.0607666969299316 + }, + { + "auxiliary_loss_clip": 0.01473687, + "auxiliary_loss_mlp": 0.01246555, + "balance_loss_clip": 1.11682022, + "balance_loss_mlp": 1.01919866, + "epoch": 0.7868694763422113, + "flos": 16582660378560.0, + "grad_norm": 2.3565911936722124, + "language_loss": 0.86923414, + "learning_rate": 4.5772298818106625e-07, + "loss": 0.89643663, + "num_input_tokens_seen": 141036940, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 2.2734375, + "step": 6544, + "time_per_iteration": 2.9547293186187744 + }, + { + "auxiliary_loss_clip": 0.01484426, + "auxiliary_loss_mlp": 0.01264511, + "balance_loss_clip": 1.12687111, + "balance_loss_mlp": 1.03543818, + "epoch": 0.7869897192328503, + "flos": 29388776340960.0, + "grad_norm": 3.5035500610192747, + "language_loss": 0.72054011, + "learning_rate": 4.572271593051384e-07, + "loss": 0.74802947, + "num_input_tokens_seen": 141054295, + "router_z_loss_clip": 3.57617188, + "router_z_loss_mlp": 2.29296875, + "step": 6545, + "time_per_iteration": 3.0419366359710693 + }, + { + "auxiliary_loss_clip": 0.0147491, + "auxiliary_loss_mlp": 0.0126299, + "balance_loss_clip": 1.11713552, + "balance_loss_mlp": 1.03372693, + "epoch": 0.7871099621234895, + "flos": 17130841539840.0, + "grad_norm": 1.649086207093302, + "language_loss": 0.77992105, + "learning_rate": 4.567315644666245e-07, + "loss": 0.80730009, + "num_input_tokens_seen": 141073090, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 2.29296875, + "step": 6546, + "time_per_iteration": 3.8577940464019775 + }, + { + "auxiliary_loss_clip": 0.01480409, + "auxiliary_loss_mlp": 0.01259994, + "balance_loss_clip": 1.12451911, + "balance_loss_mlp": 1.03435469, + "epoch": 0.7872302050141285, + "flos": 23442415888320.0, + "grad_norm": 2.4726037504084433, + "language_loss": 0.85018677, + "learning_rate": 4.5623620374070507e-07, + "loss": 0.87759084, + "num_input_tokens_seen": 141092405, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.25976562, + "step": 6547, + "time_per_iteration": 3.9098267555236816 + }, + { + "auxiliary_loss_clip": 0.01430688, + "auxiliary_loss_mlp": 0.0119326, + "balance_loss_clip": 1.09182966, + "balance_loss_mlp": 1.0067215, + "epoch": 0.7873504479047676, + "flos": 65967172313760.0, + "grad_norm": 0.764911952222436, + "language_loss": 0.58300418, + "learning_rate": 4.557410772025263e-07, + "loss": 0.60924369, + "num_input_tokens_seen": 141154355, + "router_z_loss_clip": 3.390625, + "router_z_loss_mlp": 1.86328125, + "step": 6548, + "time_per_iteration": 3.4856390953063965 + }, + { + "auxiliary_loss_clip": 0.01480073, + "auxiliary_loss_mlp": 0.0126383, + "balance_loss_clip": 1.12558556, + "balance_loss_mlp": 1.03342247, + "epoch": 0.7874706907954068, + "flos": 23260586467680.0, + "grad_norm": 1.8497591187243279, + "language_loss": 0.66171807, + "learning_rate": 4.5524618492719803e-07, + "loss": 0.68915713, + "num_input_tokens_seen": 141173575, + "router_z_loss_clip": 3.54101562, + "router_z_loss_mlp": 2.30078125, + "step": 6549, + "time_per_iteration": 3.007599115371704 + }, + { + "auxiliary_loss_clip": 0.01473922, + "auxiliary_loss_mlp": 0.01263244, + "balance_loss_clip": 1.11667717, + "balance_loss_mlp": 1.03474307, + "epoch": 0.7875909336860458, + "flos": 28770351498720.0, + "grad_norm": 1.649560513605006, + "language_loss": 0.78783524, + "learning_rate": 4.54751526989795e-07, + "loss": 0.81520683, + "num_input_tokens_seen": 141195415, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.28515625, + "step": 6550, + "time_per_iteration": 3.004197835922241 + }, + { + "auxiliary_loss_clip": 0.01477536, + "auxiliary_loss_mlp": 0.01260921, + "balance_loss_clip": 1.12158787, + "balance_loss_mlp": 1.03280187, + "epoch": 0.7877111765766849, + "flos": 18699262477920.0, + "grad_norm": 4.198098086337716, + "language_loss": 0.79141641, + "learning_rate": 4.5425710346535775e-07, + "loss": 0.81880093, + "num_input_tokens_seen": 141213360, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.28320312, + "step": 6551, + "time_per_iteration": 3.7520647048950195 + }, + { + "auxiliary_loss_clip": 0.01472163, + "auxiliary_loss_mlp": 0.01257507, + "balance_loss_clip": 1.11432815, + "balance_loss_mlp": 1.03186762, + "epoch": 0.787831419467324, + "flos": 27595059946560.0, + "grad_norm": 2.137123706741998, + "language_loss": 0.81622845, + "learning_rate": 4.537629144288877e-07, + "loss": 0.84352511, + "num_input_tokens_seen": 141230815, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.26171875, + "step": 6552, + "time_per_iteration": 3.0387823581695557 + }, + { + "auxiliary_loss_clip": 0.01478209, + "auxiliary_loss_mlp": 0.01264142, + "balance_loss_clip": 1.12247777, + "balance_loss_mlp": 1.03316236, + "epoch": 0.7879516623579631, + "flos": 18152143305120.0, + "grad_norm": 2.194583122178163, + "language_loss": 0.75058651, + "learning_rate": 4.5326895995535477e-07, + "loss": 0.77801001, + "num_input_tokens_seen": 141249715, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.30664062, + "step": 6553, + "time_per_iteration": 2.9684062004089355 + }, + { + "auxiliary_loss_clip": 0.0147093, + "auxiliary_loss_mlp": 0.012485, + "balance_loss_clip": 1.11307573, + "balance_loss_mlp": 1.02190709, + "epoch": 0.7880719052486022, + "flos": 20341264775040.0, + "grad_norm": 2.73361475667788, + "language_loss": 0.8396247, + "learning_rate": 4.527752401196907e-07, + "loss": 0.86681902, + "num_input_tokens_seen": 141267730, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 2.26757812, + "step": 6554, + "time_per_iteration": 2.9906837940216064 + }, + { + "auxiliary_loss_clip": 0.01479479, + "auxiliary_loss_mlp": 0.01265355, + "balance_loss_clip": 1.12272573, + "balance_loss_mlp": 1.03399324, + "epoch": 0.7881921481392413, + "flos": 21655412778240.0, + "grad_norm": 2.058911252258634, + "language_loss": 0.66723228, + "learning_rate": 4.5228175499679254e-07, + "loss": 0.69468063, + "num_input_tokens_seen": 141287315, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 2.30859375, + "step": 6555, + "time_per_iteration": 2.9920167922973633 + }, + { + "auxiliary_loss_clip": 0.01431424, + "auxiliary_loss_mlp": 0.01198013, + "balance_loss_clip": 1.09192276, + "balance_loss_mlp": 1.00956726, + "epoch": 0.7883123910298804, + "flos": 68572256286240.0, + "grad_norm": 0.838732531628231, + "language_loss": 0.54514533, + "learning_rate": 4.5178850466152174e-07, + "loss": 0.57143974, + "num_input_tokens_seen": 141346145, + "router_z_loss_clip": 3.3984375, + "router_z_loss_mlp": 1.8828125, + "step": 6556, + "time_per_iteration": 3.47286057472229 + }, + { + "auxiliary_loss_clip": 0.01478284, + "auxiliary_loss_mlp": 0.01254873, + "balance_loss_clip": 1.12214839, + "balance_loss_mlp": 1.0269444, + "epoch": 0.7884326339205194, + "flos": 19320266435040.0, + "grad_norm": 1.9810538630912877, + "language_loss": 0.82612002, + "learning_rate": 4.512954891887031e-07, + "loss": 0.85345161, + "num_input_tokens_seen": 141364445, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.27734375, + "step": 6557, + "time_per_iteration": 3.0035271644592285 + }, + { + "auxiliary_loss_clip": 0.01472156, + "auxiliary_loss_mlp": 0.01259765, + "balance_loss_clip": 1.11480021, + "balance_loss_mlp": 1.03088307, + "epoch": 0.7885528768111585, + "flos": 17786815624800.0, + "grad_norm": 2.722314530915683, + "language_loss": 0.83559167, + "learning_rate": 4.5080270865312806e-07, + "loss": 0.86291087, + "num_input_tokens_seen": 141381640, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.28515625, + "step": 6558, + "time_per_iteration": 3.008157253265381 + }, + { + "auxiliary_loss_clip": 0.01474631, + "auxiliary_loss_mlp": 0.0125508, + "balance_loss_clip": 1.11837673, + "balance_loss_mlp": 1.02562594, + "epoch": 0.7886731197017977, + "flos": 18809672444640.0, + "grad_norm": 2.7790863545903313, + "language_loss": 0.71272737, + "learning_rate": 4.5031016312954985e-07, + "loss": 0.74002451, + "num_input_tokens_seen": 141399955, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.29296875, + "step": 6559, + "time_per_iteration": 3.194467306137085 + }, + { + "auxiliary_loss_clip": 0.0148061, + "auxiliary_loss_mlp": 0.01261639, + "balance_loss_clip": 1.12468302, + "balance_loss_mlp": 1.02856135, + "epoch": 0.7887933625924367, + "flos": 33368200670880.0, + "grad_norm": 2.0637121370734466, + "language_loss": 0.74429494, + "learning_rate": 4.498178526926886e-07, + "loss": 0.77171743, + "num_input_tokens_seen": 141420820, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.32617188, + "step": 6560, + "time_per_iteration": 3.934675693511963 + }, + { + "auxiliary_loss_clip": 0.01475563, + "auxiliary_loss_mlp": 0.01261771, + "balance_loss_clip": 1.117594, + "balance_loss_mlp": 1.03918386, + "epoch": 0.7889136054830758, + "flos": 17021265992640.0, + "grad_norm": 2.362230579281097, + "language_loss": 0.72431904, + "learning_rate": 4.4932577741722635e-07, + "loss": 0.75169235, + "num_input_tokens_seen": 141439350, + "router_z_loss_clip": 3.58007812, + "router_z_loss_mlp": 2.22851562, + "step": 6561, + "time_per_iteration": 3.0018303394317627 + }, + { + "auxiliary_loss_clip": 0.01474723, + "auxiliary_loss_mlp": 0.0125672, + "balance_loss_clip": 1.11655653, + "balance_loss_mlp": 1.02879143, + "epoch": 0.7890338483737149, + "flos": 29427235859520.0, + "grad_norm": 1.7504001022867968, + "language_loss": 0.74303752, + "learning_rate": 4.4883393737780985e-07, + "loss": 0.77035195, + "num_input_tokens_seen": 141460300, + "router_z_loss_clip": 3.58203125, + "router_z_loss_mlp": 2.27929688, + "step": 6562, + "time_per_iteration": 2.996605634689331 + }, + { + "auxiliary_loss_clip": 0.01471464, + "auxiliary_loss_mlp": 0.01253676, + "balance_loss_clip": 1.11305213, + "balance_loss_mlp": 1.02517581, + "epoch": 0.789154091264354, + "flos": 19973471764320.0, + "grad_norm": 1.9517185092553875, + "language_loss": 0.78622913, + "learning_rate": 4.4834233264905254e-07, + "loss": 0.8134805, + "num_input_tokens_seen": 141477315, + "router_z_loss_clip": 3.58398438, + "router_z_loss_mlp": 2.28515625, + "step": 6563, + "time_per_iteration": 3.1101043224334717 + }, + { + "auxiliary_loss_clip": 0.01482854, + "auxiliary_loss_mlp": 0.01263988, + "balance_loss_clip": 1.12596059, + "balance_loss_mlp": 1.03224564, + "epoch": 0.789274334154993, + "flos": 14539563781920.0, + "grad_norm": 2.684467325810253, + "language_loss": 0.71926123, + "learning_rate": 4.478509633055294e-07, + "loss": 0.74672967, + "num_input_tokens_seen": 141495025, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.3125, + "step": 6564, + "time_per_iteration": 2.9093525409698486 + }, + { + "auxiliary_loss_clip": 0.01476766, + "auxiliary_loss_mlp": 0.01276218, + "balance_loss_clip": 1.11739111, + "balance_loss_mlp": 1.04771805, + "epoch": 0.7893945770456322, + "flos": 21829125572640.0, + "grad_norm": 5.999601536406313, + "language_loss": 0.80423266, + "learning_rate": 4.473598294217813e-07, + "loss": 0.83176249, + "num_input_tokens_seen": 141510450, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 2.28320312, + "step": 6565, + "time_per_iteration": 2.966071128845215 + }, + { + "auxiliary_loss_clip": 0.01474106, + "auxiliary_loss_mlp": 0.01255812, + "balance_loss_clip": 1.11711872, + "balance_loss_mlp": 1.03093505, + "epoch": 0.7895148199362713, + "flos": 20742738356160.0, + "grad_norm": 2.473846467842903, + "language_loss": 0.71693468, + "learning_rate": 4.468689310723124e-07, + "loss": 0.74423385, + "num_input_tokens_seen": 141528265, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.25, + "step": 6566, + "time_per_iteration": 2.971496105194092 + }, + { + "auxiliary_loss_clip": 0.01474658, + "auxiliary_loss_mlp": 0.01253121, + "balance_loss_clip": 1.11811101, + "balance_loss_mlp": 1.02652776, + "epoch": 0.7896350628269103, + "flos": 16692539351040.0, + "grad_norm": 1.7491734193014037, + "language_loss": 0.78546607, + "learning_rate": 4.463782683315913e-07, + "loss": 0.8127439, + "num_input_tokens_seen": 141547270, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.26953125, + "step": 6567, + "time_per_iteration": 2.9198107719421387 + }, + { + "auxiliary_loss_clip": 0.01475718, + "auxiliary_loss_mlp": 0.01255028, + "balance_loss_clip": 1.11886156, + "balance_loss_mlp": 1.02671814, + "epoch": 0.7897553057175495, + "flos": 22640606570880.0, + "grad_norm": 1.8184610265943926, + "language_loss": 0.73142636, + "learning_rate": 4.458878412740523e-07, + "loss": 0.75873387, + "num_input_tokens_seen": 141566050, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.28125, + "step": 6568, + "time_per_iteration": 2.977189540863037 + }, + { + "auxiliary_loss_clip": 0.0148011, + "auxiliary_loss_mlp": 0.01256353, + "balance_loss_clip": 1.12340045, + "balance_loss_mlp": 1.02499163, + "epoch": 0.7898755486081885, + "flos": 14539298284800.0, + "grad_norm": 2.7845557782173915, + "language_loss": 0.78639174, + "learning_rate": 4.453976499740919e-07, + "loss": 0.81375647, + "num_input_tokens_seen": 141583695, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.31054688, + "step": 6569, + "time_per_iteration": 2.9497225284576416 + }, + { + "auxiliary_loss_clip": 0.01476297, + "auxiliary_loss_mlp": 0.01249479, + "balance_loss_clip": 1.11933446, + "balance_loss_mlp": 1.02288628, + "epoch": 0.7899957914988276, + "flos": 17240417087040.0, + "grad_norm": 2.727843915169166, + "language_loss": 0.77990049, + "learning_rate": 4.4490769450607215e-07, + "loss": 0.80715823, + "num_input_tokens_seen": 141601320, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.26757812, + "step": 6570, + "time_per_iteration": 2.9771294593811035 + }, + { + "auxiliary_loss_clip": 0.01473487, + "auxiliary_loss_mlp": 0.01261626, + "balance_loss_clip": 1.11592984, + "balance_loss_mlp": 1.03293538, + "epoch": 0.7901160343894668, + "flos": 41282445450240.0, + "grad_norm": 1.9640385337446529, + "language_loss": 0.72862077, + "learning_rate": 4.4441797494431845e-07, + "loss": 0.75597191, + "num_input_tokens_seen": 141623125, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.28710938, + "step": 6571, + "time_per_iteration": 3.1607272624969482 + }, + { + "auxiliary_loss_clip": 0.01475477, + "auxiliary_loss_mlp": 0.01257098, + "balance_loss_clip": 1.11787033, + "balance_loss_mlp": 1.03012347, + "epoch": 0.7902362772801058, + "flos": 16838905577760.0, + "grad_norm": 2.1027772699694487, + "language_loss": 0.77640134, + "learning_rate": 4.439284913631207e-07, + "loss": 0.80372709, + "num_input_tokens_seen": 141640335, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 2.27148438, + "step": 6572, + "time_per_iteration": 3.000589609146118 + }, + { + "auxiliary_loss_clip": 0.01478375, + "auxiliary_loss_mlp": 0.01257165, + "balance_loss_clip": 1.12229276, + "balance_loss_mlp": 1.02847409, + "epoch": 0.7903565201707449, + "flos": 27128462986080.0, + "grad_norm": 2.4217960535323892, + "language_loss": 0.84512132, + "learning_rate": 4.434392438367347e-07, + "loss": 0.87247682, + "num_input_tokens_seen": 141659760, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.2890625, + "step": 6573, + "time_per_iteration": 3.823303699493408 + }, + { + "auxiliary_loss_clip": 0.01475529, + "auxiliary_loss_mlp": 0.01257509, + "balance_loss_clip": 1.11824036, + "balance_loss_mlp": 1.02862704, + "epoch": 0.790476763061384, + "flos": 31027592672640.0, + "grad_norm": 2.4570132127561193, + "language_loss": 0.74356067, + "learning_rate": 4.4295023243937677e-07, + "loss": 0.77089107, + "num_input_tokens_seen": 141679965, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 2.28710938, + "step": 6574, + "time_per_iteration": 3.069716453552246 + }, + { + "auxiliary_loss_clip": 0.01480592, + "auxiliary_loss_mlp": 0.01262227, + "balance_loss_clip": 1.12299943, + "balance_loss_mlp": 1.03372633, + "epoch": 0.7905970059520231, + "flos": 22091173780320.0, + "grad_norm": 2.28173429117699, + "language_loss": 0.80308366, + "learning_rate": 4.4246145724523123e-07, + "loss": 0.83051181, + "num_input_tokens_seen": 141697710, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.28515625, + "step": 6575, + "time_per_iteration": 3.8364593982696533 + }, + { + "auxiliary_loss_clip": 0.01474119, + "auxiliary_loss_mlp": 0.01252945, + "balance_loss_clip": 1.11694813, + "balance_loss_mlp": 1.02616119, + "epoch": 0.7907172488426621, + "flos": 20560226228640.0, + "grad_norm": 2.2346422289191676, + "language_loss": 0.77275515, + "learning_rate": 4.41972918328444e-07, + "loss": 0.80002582, + "num_input_tokens_seen": 141715145, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.26953125, + "step": 6576, + "time_per_iteration": 3.0066397190093994 + }, + { + "auxiliary_loss_clip": 0.01478902, + "auxiliary_loss_mlp": 0.01258913, + "balance_loss_clip": 1.12203026, + "balance_loss_mlp": 1.03098488, + "epoch": 0.7908374917333013, + "flos": 30084196076640.0, + "grad_norm": 2.6254704449598307, + "language_loss": 0.77512926, + "learning_rate": 4.4148461576312646e-07, + "loss": 0.8025074, + "num_input_tokens_seen": 141734810, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.27734375, + "step": 6577, + "time_per_iteration": 3.0581982135772705 + }, + { + "auxiliary_loss_clip": 0.01478481, + "auxiliary_loss_mlp": 0.01259097, + "balance_loss_clip": 1.12275219, + "balance_loss_mlp": 1.03155029, + "epoch": 0.7909577346239404, + "flos": 20998490489280.0, + "grad_norm": 1.6852696535152865, + "language_loss": 0.74796903, + "learning_rate": 4.4099654962335343e-07, + "loss": 0.77534473, + "num_input_tokens_seen": 141755260, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.27539062, + "step": 6578, + "time_per_iteration": 3.9583804607391357 + }, + { + "auxiliary_loss_clip": 0.01476175, + "auxiliary_loss_mlp": 0.01256113, + "balance_loss_clip": 1.11882234, + "balance_loss_mlp": 1.02608716, + "epoch": 0.7910779775145794, + "flos": 26250227697600.0, + "grad_norm": 3.063377514660743, + "language_loss": 0.75140822, + "learning_rate": 4.405087199831636e-07, + "loss": 0.77873111, + "num_input_tokens_seen": 141775500, + "router_z_loss_clip": 3.57617188, + "router_z_loss_mlp": 2.29882812, + "step": 6579, + "time_per_iteration": 3.0892364978790283 + }, + { + "auxiliary_loss_clip": 0.01470566, + "auxiliary_loss_mlp": 0.01254317, + "balance_loss_clip": 1.11433363, + "balance_loss_mlp": 1.02562582, + "epoch": 0.7911982204052186, + "flos": 22566608002080.0, + "grad_norm": 2.4724075983605727, + "language_loss": 0.67213321, + "learning_rate": 4.400211269165619e-07, + "loss": 0.69938207, + "num_input_tokens_seen": 141791955, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.28515625, + "step": 6580, + "time_per_iteration": 3.0527849197387695 + }, + { + "auxiliary_loss_clip": 0.01483083, + "auxiliary_loss_mlp": 0.0125486, + "balance_loss_clip": 1.12868142, + "balance_loss_mlp": 1.02731287, + "epoch": 0.7913184632958576, + "flos": 23114789163360.0, + "grad_norm": 1.5313301032729199, + "language_loss": 0.76863313, + "learning_rate": 4.3953377049751416e-07, + "loss": 0.79601252, + "num_input_tokens_seen": 141812380, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 2.27734375, + "step": 6581, + "time_per_iteration": 3.0744404792785645 + }, + { + "auxiliary_loss_clip": 0.01476894, + "auxiliary_loss_mlp": 0.01265244, + "balance_loss_clip": 1.12212324, + "balance_loss_mlp": 1.03788805, + "epoch": 0.7914387061864967, + "flos": 12313348207200.0, + "grad_norm": 3.45340447843336, + "language_loss": 0.78036845, + "learning_rate": 4.390466507999537e-07, + "loss": 0.8077898, + "num_input_tokens_seen": 141828130, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.2734375, + "step": 6582, + "time_per_iteration": 3.1150147914886475 + }, + { + "auxiliary_loss_clip": 0.01476778, + "auxiliary_loss_mlp": 0.01259346, + "balance_loss_clip": 1.11958277, + "balance_loss_mlp": 1.0295105, + "epoch": 0.7915589490771359, + "flos": 17605213773120.0, + "grad_norm": 7.691425968787072, + "language_loss": 0.76151305, + "learning_rate": 4.385597678977748e-07, + "loss": 0.78887433, + "num_input_tokens_seen": 141846965, + "router_z_loss_clip": 3.57617188, + "router_z_loss_mlp": 2.29492188, + "step": 6583, + "time_per_iteration": 3.0171701908111572 + }, + { + "auxiliary_loss_clip": 0.0147137, + "auxiliary_loss_mlp": 0.01254083, + "balance_loss_clip": 1.11434042, + "balance_loss_mlp": 1.02615511, + "epoch": 0.7916791919677749, + "flos": 25593532977600.0, + "grad_norm": 1.9552845346185757, + "language_loss": 0.75501543, + "learning_rate": 4.3807312186483726e-07, + "loss": 0.78226995, + "num_input_tokens_seen": 141867685, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.27929688, + "step": 6584, + "time_per_iteration": 3.0207252502441406 + }, + { + "auxiliary_loss_clip": 0.01481465, + "auxiliary_loss_mlp": 0.01256353, + "balance_loss_clip": 1.124681, + "balance_loss_mlp": 1.02575493, + "epoch": 0.791799434858414, + "flos": 18846387267840.0, + "grad_norm": 2.269789327868006, + "language_loss": 0.78321099, + "learning_rate": 4.375867127749655e-07, + "loss": 0.81058919, + "num_input_tokens_seen": 141885960, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.30273438, + "step": 6585, + "time_per_iteration": 2.9910285472869873 + }, + { + "auxiliary_loss_clip": 0.01474126, + "auxiliary_loss_mlp": 0.01259171, + "balance_loss_clip": 1.11739159, + "balance_loss_mlp": 1.03086162, + "epoch": 0.7919196777490531, + "flos": 25814390839200.0, + "grad_norm": 1.9119040355705503, + "language_loss": 0.67264569, + "learning_rate": 4.3710054070194744e-07, + "loss": 0.69997865, + "num_input_tokens_seen": 141905655, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.28125, + "step": 6586, + "time_per_iteration": 2.9344899654388428 + }, + { + "auxiliary_loss_clip": 0.01471735, + "auxiliary_loss_mlp": 0.01260523, + "balance_loss_clip": 1.11548638, + "balance_loss_mlp": 1.02935219, + "epoch": 0.7920399206396922, + "flos": 11949234228000.0, + "grad_norm": 2.978567051800974, + "language_loss": 0.67058587, + "learning_rate": 4.3661460571953455e-07, + "loss": 0.69790846, + "num_input_tokens_seen": 141922390, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.31054688, + "step": 6587, + "time_per_iteration": 2.9719817638397217 + }, + { + "auxiliary_loss_clip": 0.01475503, + "auxiliary_loss_mlp": 0.01250648, + "balance_loss_clip": 1.11982501, + "balance_loss_mlp": 1.02176666, + "epoch": 0.7921601635303313, + "flos": 21582058988160.0, + "grad_norm": 4.787083699189442, + "language_loss": 0.68586588, + "learning_rate": 4.36128907901443e-07, + "loss": 0.71312737, + "num_input_tokens_seen": 141941985, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.2890625, + "step": 6588, + "time_per_iteration": 3.811424970626831 + }, + { + "auxiliary_loss_clip": 0.01483568, + "auxiliary_loss_mlp": 0.01253126, + "balance_loss_clip": 1.12893844, + "balance_loss_mlp": 1.02386284, + "epoch": 0.7922804064209703, + "flos": 18115959476160.0, + "grad_norm": 2.7733613963317287, + "language_loss": 0.72945082, + "learning_rate": 4.356434473213519e-07, + "loss": 0.75681776, + "num_input_tokens_seen": 141959435, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 2.29296875, + "step": 6589, + "time_per_iteration": 2.9519834518432617 + }, + { + "auxiliary_loss_clip": 0.01474847, + "auxiliary_loss_mlp": 0.01261287, + "balance_loss_clip": 1.11875415, + "balance_loss_mlp": 1.0316422, + "epoch": 0.7924006493116095, + "flos": 21654957640320.0, + "grad_norm": 2.1082236081276013, + "language_loss": 0.79727268, + "learning_rate": 4.351582240529068e-07, + "loss": 0.82463408, + "num_input_tokens_seen": 141980265, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.29492188, + "step": 6590, + "time_per_iteration": 2.977978467941284 + }, + { + "auxiliary_loss_clip": 0.01429517, + "auxiliary_loss_mlp": 0.01195839, + "balance_loss_clip": 1.0899241, + "balance_loss_mlp": 1.00853729, + "epoch": 0.7925208922022485, + "flos": 64249957746720.0, + "grad_norm": 0.6846230161815012, + "language_loss": 0.58130366, + "learning_rate": 4.346732381697149e-07, + "loss": 0.60755718, + "num_input_tokens_seen": 142044395, + "router_z_loss_clip": 3.3984375, + "router_z_loss_mlp": 1.87109375, + "step": 6591, + "time_per_iteration": 3.44756817817688 + }, + { + "auxiliary_loss_clip": 0.01475098, + "auxiliary_loss_mlp": 0.01255158, + "balance_loss_clip": 1.11931717, + "balance_loss_mlp": 1.02570426, + "epoch": 0.7926411350928876, + "flos": 16943171182560.0, + "grad_norm": 1.8791240509837392, + "language_loss": 0.81670964, + "learning_rate": 4.3418848974534825e-07, + "loss": 0.8440122, + "num_input_tokens_seen": 142061335, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 2.29101562, + "step": 6592, + "time_per_iteration": 2.9696505069732666 + }, + { + "auxiliary_loss_clip": 0.01479534, + "auxiliary_loss_mlp": 0.01255773, + "balance_loss_clip": 1.1239903, + "balance_loss_mlp": 1.02898955, + "epoch": 0.7927613779835267, + "flos": 34462894154400.0, + "grad_norm": 1.6790516068936616, + "language_loss": 0.68780577, + "learning_rate": 4.3370397885334276e-07, + "loss": 0.71515888, + "num_input_tokens_seen": 142081965, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.26953125, + "step": 6593, + "time_per_iteration": 3.2013046741485596 + }, + { + "auxiliary_loss_clip": 0.01478821, + "auxiliary_loss_mlp": 0.0125505, + "balance_loss_clip": 1.12360573, + "balance_loss_mlp": 1.0255959, + "epoch": 0.7928816208741658, + "flos": 18953080274880.0, + "grad_norm": 2.1437621266712936, + "language_loss": 0.75707114, + "learning_rate": 4.3321970556719777e-07, + "loss": 0.78440982, + "num_input_tokens_seen": 142100260, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.29101562, + "step": 6594, + "time_per_iteration": 3.1123034954071045 + }, + { + "auxiliary_loss_clip": 0.01479586, + "auxiliary_loss_mlp": 0.01260909, + "balance_loss_clip": 1.12253547, + "balance_loss_mlp": 1.03279042, + "epoch": 0.7930018637648049, + "flos": 18624922555680.0, + "grad_norm": 9.687393615062213, + "language_loss": 0.72232229, + "learning_rate": 4.3273566996037856e-07, + "loss": 0.74972725, + "num_input_tokens_seen": 142116955, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 2.28125, + "step": 6595, + "time_per_iteration": 3.0992624759674072 + }, + { + "auxiliary_loss_clip": 0.01472454, + "auxiliary_loss_mlp": 0.0125123, + "balance_loss_clip": 1.11609554, + "balance_loss_mlp": 1.02406502, + "epoch": 0.793122106655444, + "flos": 24532709705280.0, + "grad_norm": 4.942238723379798, + "language_loss": 0.80647039, + "learning_rate": 4.322518721063113e-07, + "loss": 0.83370721, + "num_input_tokens_seen": 142135505, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.2734375, + "step": 6596, + "time_per_iteration": 3.0136728286743164 + }, + { + "auxiliary_loss_clip": 0.0147698, + "auxiliary_loss_mlp": 0.01256288, + "balance_loss_clip": 1.1202544, + "balance_loss_mlp": 1.02683377, + "epoch": 0.7932423495460831, + "flos": 34423979497920.0, + "grad_norm": 1.908668333837988, + "language_loss": 0.7023626, + "learning_rate": 4.3176831207838906e-07, + "loss": 0.72969526, + "num_input_tokens_seen": 142158915, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.29296875, + "step": 6597, + "time_per_iteration": 3.0991780757904053 + }, + { + "auxiliary_loss_clip": 0.0147674, + "auxiliary_loss_mlp": 0.01254374, + "balance_loss_clip": 1.11881769, + "balance_loss_mlp": 1.02625465, + "epoch": 0.7933625924367221, + "flos": 26982817394400.0, + "grad_norm": 2.4018377963101143, + "language_loss": 0.7433964, + "learning_rate": 4.3128498994996685e-07, + "loss": 0.77070749, + "num_input_tokens_seen": 142178390, + "router_z_loss_clip": 3.58007812, + "router_z_loss_mlp": 2.27734375, + "step": 6598, + "time_per_iteration": 3.0793650150299072 + }, + { + "auxiliary_loss_clip": 0.01474192, + "auxiliary_loss_mlp": 0.0125572, + "balance_loss_clip": 1.11748087, + "balance_loss_mlp": 1.02607501, + "epoch": 0.7934828353273613, + "flos": 29571136755840.0, + "grad_norm": 2.9908954466103577, + "language_loss": 0.7168076, + "learning_rate": 4.308019057943646e-07, + "loss": 0.74410677, + "num_input_tokens_seen": 142200115, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 2.29492188, + "step": 6599, + "time_per_iteration": 3.050588846206665 + }, + { + "auxiliary_loss_clip": 0.01473822, + "auxiliary_loss_mlp": 0.01255208, + "balance_loss_clip": 1.11561513, + "balance_loss_mlp": 1.02823377, + "epoch": 0.7936030782180004, + "flos": 28617575412960.0, + "grad_norm": 1.913653062183851, + "language_loss": 0.74536318, + "learning_rate": 4.3031905968486535e-07, + "loss": 0.77265346, + "num_input_tokens_seen": 142220945, + "router_z_loss_clip": 3.58203125, + "router_z_loss_mlp": 2.26953125, + "step": 6600, + "time_per_iteration": 3.129265785217285 + }, + { + "auxiliary_loss_clip": 0.01475163, + "auxiliary_loss_mlp": 0.01265115, + "balance_loss_clip": 1.11720467, + "balance_loss_mlp": 1.03890347, + "epoch": 0.7937233211086394, + "flos": 16394079745440.0, + "grad_norm": 2.914587722083438, + "language_loss": 0.68300152, + "learning_rate": 4.298364516947162e-07, + "loss": 0.71040434, + "num_input_tokens_seen": 142238175, + "router_z_loss_clip": 3.58203125, + "router_z_loss_mlp": 2.26367188, + "step": 6601, + "time_per_iteration": 4.778397560119629 + }, + { + "auxiliary_loss_clip": 0.01480429, + "auxiliary_loss_mlp": 0.01254145, + "balance_loss_clip": 1.1238482, + "balance_loss_mlp": 1.0256443, + "epoch": 0.7938435639992786, + "flos": 22015354659840.0, + "grad_norm": 2.029543711086696, + "language_loss": 0.65413988, + "learning_rate": 4.293540818971295e-07, + "loss": 0.68148559, + "num_input_tokens_seen": 142255980, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.28320312, + "step": 6602, + "time_per_iteration": 3.027421712875366 + }, + { + "auxiliary_loss_clip": 0.01476541, + "auxiliary_loss_mlp": 0.01252974, + "balance_loss_clip": 1.11968517, + "balance_loss_mlp": 1.02485466, + "epoch": 0.7939638068899176, + "flos": 22199042560320.0, + "grad_norm": 2.301889632357154, + "language_loss": 0.76697409, + "learning_rate": 4.2887195036527934e-07, + "loss": 0.79426926, + "num_input_tokens_seen": 142274785, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.28125, + "step": 6603, + "time_per_iteration": 2.964614152908325 + }, + { + "auxiliary_loss_clip": 0.01473315, + "auxiliary_loss_mlp": 0.01256693, + "balance_loss_clip": 1.11585689, + "balance_loss_mlp": 1.03010035, + "epoch": 0.7940840497805567, + "flos": 17746914836160.0, + "grad_norm": 3.383118309057966, + "language_loss": 0.73993027, + "learning_rate": 4.28390057172306e-07, + "loss": 0.76723039, + "num_input_tokens_seen": 142291290, + "router_z_loss_clip": 3.57617188, + "router_z_loss_mlp": 2.26757812, + "step": 6604, + "time_per_iteration": 3.038670539855957 + }, + { + "auxiliary_loss_clip": 0.01475125, + "auxiliary_loss_mlp": 0.01258767, + "balance_loss_clip": 1.11824608, + "balance_loss_mlp": 1.0325551, + "epoch": 0.7942042926711959, + "flos": 23807705640480.0, + "grad_norm": 2.2962291094404224, + "language_loss": 0.71963346, + "learning_rate": 4.279084023913111e-07, + "loss": 0.74697238, + "num_input_tokens_seen": 142309165, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.26367188, + "step": 6605, + "time_per_iteration": 2.983400583267212 + }, + { + "auxiliary_loss_clip": 0.01479501, + "auxiliary_loss_mlp": 0.01267693, + "balance_loss_clip": 1.12332702, + "balance_loss_mlp": 1.04109991, + "epoch": 0.7943245355618349, + "flos": 19246988501280.0, + "grad_norm": 1.8353900988361187, + "language_loss": 0.69623923, + "learning_rate": 4.2742698609536096e-07, + "loss": 0.72371113, + "num_input_tokens_seen": 142327475, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.26367188, + "step": 6606, + "time_per_iteration": 3.8851146697998047 + }, + { + "auxiliary_loss_clip": 0.01475915, + "auxiliary_loss_mlp": 0.01257652, + "balance_loss_clip": 1.11953938, + "balance_loss_mlp": 1.02762604, + "epoch": 0.794444778452474, + "flos": 25009850694240.0, + "grad_norm": 2.1589014065137335, + "language_loss": 0.78724617, + "learning_rate": 4.2694580835748706e-07, + "loss": 0.81458187, + "num_input_tokens_seen": 142347335, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.29492188, + "step": 6607, + "time_per_iteration": 3.0566234588623047 + }, + { + "auxiliary_loss_clip": 0.01477575, + "auxiliary_loss_mlp": 0.01260713, + "balance_loss_clip": 1.12106347, + "balance_loss_mlp": 1.03526473, + "epoch": 0.7945650213431131, + "flos": 23223719931840.0, + "grad_norm": 2.712735141830899, + "language_loss": 0.74242198, + "learning_rate": 4.264648692506836e-07, + "loss": 0.76980489, + "num_input_tokens_seen": 142366125, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.2578125, + "step": 6608, + "time_per_iteration": 2.9270458221435547 + }, + { + "auxiliary_loss_clip": 0.01473718, + "auxiliary_loss_mlp": 0.01264376, + "balance_loss_clip": 1.11490536, + "balance_loss_mlp": 1.03778303, + "epoch": 0.7946852642337522, + "flos": 26065022670720.0, + "grad_norm": 1.8297960477306914, + "language_loss": 0.72224504, + "learning_rate": 4.2598416884790824e-07, + "loss": 0.74962598, + "num_input_tokens_seen": 142385175, + "router_z_loss_clip": 3.58789062, + "router_z_loss_mlp": 2.26757812, + "step": 6609, + "time_per_iteration": 3.0391714572906494 + }, + { + "auxiliary_loss_clip": 0.01474538, + "auxiliary_loss_mlp": 0.01264996, + "balance_loss_clip": 1.11731446, + "balance_loss_mlp": 1.0374496, + "epoch": 0.7948055071243912, + "flos": 23772014877600.0, + "grad_norm": 2.775232028820234, + "language_loss": 0.81059003, + "learning_rate": 4.255037072220828e-07, + "loss": 0.8379854, + "num_input_tokens_seen": 142406545, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 2.27734375, + "step": 6610, + "time_per_iteration": 2.9812350273132324 + }, + { + "auxiliary_loss_clip": 0.0146914, + "auxiliary_loss_mlp": 0.01252819, + "balance_loss_clip": 1.1119504, + "balance_loss_mlp": 1.02679825, + "epoch": 0.7949257500150304, + "flos": 21982963646880.0, + "grad_norm": 1.991668326053287, + "language_loss": 0.72127914, + "learning_rate": 4.2502348444609293e-07, + "loss": 0.7484988, + "num_input_tokens_seen": 142426165, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.25976562, + "step": 6611, + "time_per_iteration": 2.956505537033081 + }, + { + "auxiliary_loss_clip": 0.01475176, + "auxiliary_loss_mlp": 0.01261506, + "balance_loss_clip": 1.11858797, + "balance_loss_mlp": 1.03300595, + "epoch": 0.7950459929056695, + "flos": 25776424386720.0, + "grad_norm": 1.8554223905847331, + "language_loss": 0.69473916, + "learning_rate": 4.2454350059278844e-07, + "loss": 0.72210598, + "num_input_tokens_seen": 142447225, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.28515625, + "step": 6612, + "time_per_iteration": 3.0306549072265625 + }, + { + "auxiliary_loss_clip": 0.01471969, + "auxiliary_loss_mlp": 0.01251833, + "balance_loss_clip": 1.11646962, + "balance_loss_mlp": 1.02523994, + "epoch": 0.7951662357963085, + "flos": 22159938263040.0, + "grad_norm": 2.0211015919164894, + "language_loss": 0.84563756, + "learning_rate": 4.240637557349824e-07, + "loss": 0.87287557, + "num_input_tokens_seen": 142464440, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.265625, + "step": 6613, + "time_per_iteration": 3.081240653991699 + }, + { + "auxiliary_loss_clip": 0.0147318, + "auxiliary_loss_mlp": 0.01256447, + "balance_loss_clip": 1.11661756, + "balance_loss_mlp": 1.03157079, + "epoch": 0.7952864786869477, + "flos": 24644105804160.0, + "grad_norm": 3.4421225656719048, + "language_loss": 0.66515064, + "learning_rate": 4.235842499454516e-07, + "loss": 0.69244689, + "num_input_tokens_seen": 142484355, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.25, + "step": 6614, + "time_per_iteration": 3.118434190750122 + }, + { + "auxiliary_loss_clip": 0.01476841, + "auxiliary_loss_mlp": 0.01261965, + "balance_loss_clip": 1.12057674, + "balance_loss_mlp": 1.03327394, + "epoch": 0.7954067215775867, + "flos": 21832842532320.0, + "grad_norm": 1.7261410400798696, + "language_loss": 0.82796317, + "learning_rate": 4.2310498329693687e-07, + "loss": 0.85535121, + "num_input_tokens_seen": 142505255, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.28515625, + "step": 6615, + "time_per_iteration": 3.066671371459961 + }, + { + "auxiliary_loss_clip": 0.01475401, + "auxiliary_loss_mlp": 0.0125378, + "balance_loss_clip": 1.11887169, + "balance_loss_mlp": 1.02318192, + "epoch": 0.7955269644682258, + "flos": 24062926779360.0, + "grad_norm": 1.8599946334798112, + "language_loss": 0.80996597, + "learning_rate": 4.2262595586214164e-07, + "loss": 0.83725774, + "num_input_tokens_seen": 142526350, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.30273438, + "step": 6616, + "time_per_iteration": 3.8328521251678467 + }, + { + "auxiliary_loss_clip": 0.0147712, + "auxiliary_loss_mlp": 0.01259946, + "balance_loss_clip": 1.12077439, + "balance_loss_mlp": 1.02782202, + "epoch": 0.795647207358865, + "flos": 25013150444160.0, + "grad_norm": 1.6084264579301852, + "language_loss": 0.76842642, + "learning_rate": 4.221471677137358e-07, + "loss": 0.79579711, + "num_input_tokens_seen": 142547165, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.31640625, + "step": 6617, + "time_per_iteration": 3.1463184356689453 + }, + { + "auxiliary_loss_clip": 0.01471104, + "auxiliary_loss_mlp": 0.01250498, + "balance_loss_clip": 1.11365461, + "balance_loss_mlp": 1.02352381, + "epoch": 0.795767450249504, + "flos": 14650163389440.0, + "grad_norm": 1.6214475485008992, + "language_loss": 0.70456654, + "learning_rate": 4.216686189243492e-07, + "loss": 0.73178256, + "num_input_tokens_seen": 142565955, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.27148438, + "step": 6618, + "time_per_iteration": 3.0293962955474854 + }, + { + "auxiliary_loss_clip": 0.01471727, + "auxiliary_loss_mlp": 0.0125161, + "balance_loss_clip": 1.11582327, + "balance_loss_mlp": 1.02043962, + "epoch": 0.7958876931401431, + "flos": 18549672357600.0, + "grad_norm": 1.7229131634954544, + "language_loss": 0.73213828, + "learning_rate": 4.211903095665785e-07, + "loss": 0.7593717, + "num_input_tokens_seen": 142585340, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.30664062, + "step": 6619, + "time_per_iteration": 3.0114901065826416 + }, + { + "auxiliary_loss_clip": 0.01470673, + "auxiliary_loss_mlp": 0.01252995, + "balance_loss_clip": 1.11438346, + "balance_loss_mlp": 1.02678299, + "epoch": 0.7960079360307821, + "flos": 21546140656320.0, + "grad_norm": 1.7637072546424246, + "language_loss": 0.74919426, + "learning_rate": 4.2071223971298277e-07, + "loss": 0.77643096, + "num_input_tokens_seen": 142602525, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 2.26171875, + "step": 6620, + "time_per_iteration": 3.130809783935547 + }, + { + "auxiliary_loss_clip": 0.01479216, + "auxiliary_loss_mlp": 0.01253635, + "balance_loss_clip": 1.12223935, + "balance_loss_mlp": 1.02513504, + "epoch": 0.7961281789214213, + "flos": 25483995358560.0, + "grad_norm": 7.193622566361063, + "language_loss": 0.61304885, + "learning_rate": 4.2023440943608433e-07, + "loss": 0.64037728, + "num_input_tokens_seen": 142622490, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.28320312, + "step": 6621, + "time_per_iteration": 3.079659938812256 + }, + { + "auxiliary_loss_clip": 0.01472588, + "auxiliary_loss_mlp": 0.01254987, + "balance_loss_clip": 1.11670721, + "balance_loss_mlp": 1.02915657, + "epoch": 0.7962484218120603, + "flos": 21946590177120.0, + "grad_norm": 1.7964207999325705, + "language_loss": 0.78336865, + "learning_rate": 4.1975681880837023e-07, + "loss": 0.81064439, + "num_input_tokens_seen": 142642495, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.25976562, + "step": 6622, + "time_per_iteration": 3.0301787853240967 + }, + { + "auxiliary_loss_clip": 0.01472699, + "auxiliary_loss_mlp": 0.01252098, + "balance_loss_clip": 1.11603177, + "balance_loss_mlp": 1.02397919, + "epoch": 0.7963686647026994, + "flos": 18878057645760.0, + "grad_norm": 1.8736462411796295, + "language_loss": 0.82598341, + "learning_rate": 4.192794679022895e-07, + "loss": 0.85323143, + "num_input_tokens_seen": 142660820, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.28125, + "step": 6623, + "time_per_iteration": 3.060448169708252 + }, + { + "auxiliary_loss_clip": 0.01472215, + "auxiliary_loss_mlp": 0.01244556, + "balance_loss_clip": 1.115165, + "balance_loss_mlp": 1.01815343, + "epoch": 0.7964889075933386, + "flos": 29719209749760.0, + "grad_norm": 1.868753513387351, + "language_loss": 0.72194219, + "learning_rate": 4.1880235679025743e-07, + "loss": 0.74910998, + "num_input_tokens_seen": 142680915, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.265625, + "step": 6624, + "time_per_iteration": 2.9749081134796143 + }, + { + "auxiliary_loss_clip": 0.01478789, + "auxiliary_loss_mlp": 0.01258436, + "balance_loss_clip": 1.12176728, + "balance_loss_mlp": 1.02993584, + "epoch": 0.7966091504839776, + "flos": 29493724652640.0, + "grad_norm": 2.0351166000223078, + "language_loss": 0.63658738, + "learning_rate": 4.1832548554464986e-07, + "loss": 0.66395962, + "num_input_tokens_seen": 142699210, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 2.28320312, + "step": 6625, + "time_per_iteration": 3.025811195373535 + }, + { + "auxiliary_loss_clip": 0.01429984, + "auxiliary_loss_mlp": 0.01195312, + "balance_loss_clip": 1.09026396, + "balance_loss_mlp": 1.00762939, + "epoch": 0.7967293933746167, + "flos": 67295012747040.0, + "grad_norm": 0.7572022572835947, + "language_loss": 0.58650941, + "learning_rate": 4.178488542378098e-07, + "loss": 0.61276239, + "num_input_tokens_seen": 142756790, + "router_z_loss_clip": 3.40625, + "router_z_loss_mlp": 1.875, + "step": 6626, + "time_per_iteration": 3.3556954860687256 + }, + { + "auxiliary_loss_clip": 0.0147702, + "auxiliary_loss_mlp": 0.01266396, + "balance_loss_clip": 1.12087893, + "balance_loss_mlp": 1.03846776, + "epoch": 0.7968496362652558, + "flos": 25556931938880.0, + "grad_norm": 1.7405304335096952, + "language_loss": 0.89015996, + "learning_rate": 4.173724629420401e-07, + "loss": 0.91759408, + "num_input_tokens_seen": 142778150, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.27929688, + "step": 6627, + "time_per_iteration": 3.0422708988189697 + }, + { + "auxiliary_loss_clip": 0.01477718, + "auxiliary_loss_mlp": 0.0126981, + "balance_loss_clip": 1.12164569, + "balance_loss_mlp": 1.03921199, + "epoch": 0.7969698791558949, + "flos": 14502811030560.0, + "grad_norm": 2.9940339987925726, + "language_loss": 0.6871742, + "learning_rate": 4.168963117296087e-07, + "loss": 0.7146495, + "num_input_tokens_seen": 142795485, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.3046875, + "step": 6628, + "time_per_iteration": 3.8275809288024902 + }, + { + "auxiliary_loss_clip": 0.0147776, + "auxiliary_loss_mlp": 0.01261297, + "balance_loss_clip": 1.1202271, + "balance_loss_mlp": 1.03203392, + "epoch": 0.797090122046534, + "flos": 22129747083360.0, + "grad_norm": 2.6399296233366747, + "language_loss": 0.75888276, + "learning_rate": 4.1642040067274876e-07, + "loss": 0.78627336, + "num_input_tokens_seen": 142815155, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.2890625, + "step": 6629, + "time_per_iteration": 3.848393678665161 + }, + { + "auxiliary_loss_clip": 0.01476067, + "auxiliary_loss_mlp": 0.01261421, + "balance_loss_clip": 1.12092745, + "balance_loss_mlp": 1.03559065, + "epoch": 0.7972103649371731, + "flos": 19899511123680.0, + "grad_norm": 1.5995432096619633, + "language_loss": 0.72859776, + "learning_rate": 4.1594472984365493e-07, + "loss": 0.75597262, + "num_input_tokens_seen": 142833840, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.25976562, + "step": 6630, + "time_per_iteration": 2.9392483234405518 + }, + { + "auxiliary_loss_clip": 0.01469826, + "auxiliary_loss_mlp": 0.01265244, + "balance_loss_clip": 1.11209548, + "balance_loss_mlp": 1.03540802, + "epoch": 0.7973306078278122, + "flos": 36061240775040.0, + "grad_norm": 1.7533608442823825, + "language_loss": 0.7745173, + "learning_rate": 4.154692993144862e-07, + "loss": 0.8018679, + "num_input_tokens_seen": 142853610, + "router_z_loss_clip": 3.57617188, + "router_z_loss_mlp": 2.29882812, + "step": 6631, + "time_per_iteration": 3.148427724838257 + }, + { + "auxiliary_loss_clip": 0.01474676, + "auxiliary_loss_mlp": 0.0125724, + "balance_loss_clip": 1.11738956, + "balance_loss_mlp": 1.03064692, + "epoch": 0.7974508507184512, + "flos": 21362718252960.0, + "grad_norm": 4.394077230855872, + "language_loss": 0.7150979, + "learning_rate": 4.1499410915736476e-07, + "loss": 0.7424171, + "num_input_tokens_seen": 142872540, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.26757812, + "step": 6632, + "time_per_iteration": 2.9821674823760986 + }, + { + "auxiliary_loss_clip": 0.01429822, + "auxiliary_loss_mlp": 0.01195724, + "balance_loss_clip": 1.09020078, + "balance_loss_mlp": 1.00842285, + "epoch": 0.7975710936090904, + "flos": 68260369747680.0, + "grad_norm": 0.793816012156262, + "language_loss": 0.64262426, + "learning_rate": 4.145191594443762e-07, + "loss": 0.66887975, + "num_input_tokens_seen": 142936895, + "router_z_loss_clip": 3.3984375, + "router_z_loss_mlp": 1.87109375, + "step": 6633, + "time_per_iteration": 3.5938215255737305 + }, + { + "auxiliary_loss_clip": 0.01475593, + "auxiliary_loss_mlp": 0.01253756, + "balance_loss_clip": 1.11957026, + "balance_loss_mlp": 1.02315712, + "epoch": 0.7976913364997295, + "flos": 22494657553920.0, + "grad_norm": 1.7177073391517306, + "language_loss": 0.70909226, + "learning_rate": 4.140444502475713e-07, + "loss": 0.7363857, + "num_input_tokens_seen": 142956445, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.30273438, + "step": 6634, + "time_per_iteration": 3.9020049571990967 + }, + { + "auxiliary_loss_clip": 0.01474738, + "auxiliary_loss_mlp": 0.01256949, + "balance_loss_clip": 1.11745143, + "balance_loss_mlp": 1.02863967, + "epoch": 0.7978115793903685, + "flos": 15264757487520.0, + "grad_norm": 2.0336317661363292, + "language_loss": 0.70431006, + "learning_rate": 4.1356998163896216e-07, + "loss": 0.73162687, + "num_input_tokens_seen": 142973495, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.28125, + "step": 6635, + "time_per_iteration": 3.0100014209747314 + }, + { + "auxiliary_loss_clip": 0.01474199, + "auxiliary_loss_mlp": 0.01263823, + "balance_loss_clip": 1.11698592, + "balance_loss_mlp": 1.03417861, + "epoch": 0.7979318222810077, + "flos": 19721284878240.0, + "grad_norm": 2.1527901029766503, + "language_loss": 0.75045496, + "learning_rate": 4.130957536905255e-07, + "loss": 0.77783513, + "num_input_tokens_seen": 142991510, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.29492188, + "step": 6636, + "time_per_iteration": 3.014583110809326 + }, + { + "auxiliary_loss_clip": 0.01479029, + "auxiliary_loss_mlp": 0.01262989, + "balance_loss_clip": 1.12315822, + "balance_loss_mlp": 1.03200865, + "epoch": 0.7980520651716467, + "flos": 15562117176480.0, + "grad_norm": 2.6686717604672237, + "language_loss": 0.71353352, + "learning_rate": 4.1262176647420134e-07, + "loss": 0.74095368, + "num_input_tokens_seen": 143009675, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.30664062, + "step": 6637, + "time_per_iteration": 3.008145809173584 + }, + { + "auxiliary_loss_clip": 0.01477455, + "auxiliary_loss_mlp": 0.01254294, + "balance_loss_clip": 1.12158918, + "balance_loss_mlp": 1.02636611, + "epoch": 0.7981723080622858, + "flos": 22311614432160.0, + "grad_norm": 1.7572484100810783, + "language_loss": 0.80015695, + "learning_rate": 4.121480200618923e-07, + "loss": 0.82747436, + "num_input_tokens_seen": 143029330, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 2.27734375, + "step": 6638, + "time_per_iteration": 3.021318197250366 + }, + { + "auxiliary_loss_clip": 0.01476527, + "auxiliary_loss_mlp": 0.01252635, + "balance_loss_clip": 1.12069178, + "balance_loss_mlp": 1.02527881, + "epoch": 0.798292550952925, + "flos": 22931783969760.0, + "grad_norm": 2.130143112267924, + "language_loss": 0.80189151, + "learning_rate": 4.116745145254674e-07, + "loss": 0.8291831, + "num_input_tokens_seen": 143048865, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 2.2734375, + "step": 6639, + "time_per_iteration": 3.0760538578033447 + }, + { + "auxiliary_loss_clip": 0.01429289, + "auxiliary_loss_mlp": 0.01190895, + "balance_loss_clip": 1.09022593, + "balance_loss_mlp": 1.00626373, + "epoch": 0.798412793843564, + "flos": 64505140957440.0, + "grad_norm": 0.768952436199426, + "language_loss": 0.57904315, + "learning_rate": 4.1120124993675476e-07, + "loss": 0.60524499, + "num_input_tokens_seen": 143113295, + "router_z_loss_clip": 3.390625, + "router_z_loss_mlp": 1.84375, + "step": 6640, + "time_per_iteration": 3.5420727729797363 + }, + { + "auxiliary_loss_clip": 0.01476679, + "auxiliary_loss_mlp": 0.01257562, + "balance_loss_clip": 1.12013721, + "balance_loss_mlp": 1.02658153, + "epoch": 0.7985330367342031, + "flos": 13588467769440.0, + "grad_norm": 2.921197733501969, + "language_loss": 0.61897576, + "learning_rate": 4.107282263675498e-07, + "loss": 0.64631814, + "num_input_tokens_seen": 143130965, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 2.30664062, + "step": 6641, + "time_per_iteration": 2.974066734313965 + }, + { + "auxiliary_loss_clip": 0.01430107, + "auxiliary_loss_mlp": 0.01193306, + "balance_loss_clip": 1.09047914, + "balance_loss_mlp": 1.00867462, + "epoch": 0.7986532796248422, + "flos": 67705627014720.0, + "grad_norm": 0.7888594345027222, + "language_loss": 0.52436972, + "learning_rate": 4.1025544388960907e-07, + "loss": 0.55060387, + "num_input_tokens_seen": 143192005, + "router_z_loss_clip": 3.40625, + "router_z_loss_mlp": 1.84375, + "step": 6642, + "time_per_iteration": 3.401599168777466 + }, + { + "auxiliary_loss_clip": 0.01477404, + "auxiliary_loss_mlp": 0.01255239, + "balance_loss_clip": 1.12023687, + "balance_loss_mlp": 1.02845538, + "epoch": 0.7987735225154813, + "flos": 22457563449120.0, + "grad_norm": 1.8368803982677513, + "language_loss": 0.71958768, + "learning_rate": 4.097829025746538e-07, + "loss": 0.74691415, + "num_input_tokens_seen": 143213550, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 2.26953125, + "step": 6643, + "time_per_iteration": 3.825347900390625 + }, + { + "auxiliary_loss_clip": 0.01429347, + "auxiliary_loss_mlp": 0.01193939, + "balance_loss_clip": 1.09031236, + "balance_loss_mlp": 1.00549316, + "epoch": 0.7988937654061203, + "flos": 68870867604480.0, + "grad_norm": 0.659397046615115, + "language_loss": 0.60948569, + "learning_rate": 4.0931060249436757e-07, + "loss": 0.63571852, + "num_input_tokens_seen": 143277390, + "router_z_loss_clip": 3.390625, + "router_z_loss_mlp": 1.8828125, + "step": 6644, + "time_per_iteration": 3.404906749725342 + }, + { + "auxiliary_loss_clip": 0.01481072, + "auxiliary_loss_mlp": 0.01257577, + "balance_loss_clip": 1.12391889, + "balance_loss_mlp": 1.03251004, + "epoch": 0.7990140082967595, + "flos": 20808620298720.0, + "grad_norm": 2.9854421237105924, + "language_loss": 0.69880128, + "learning_rate": 4.088385437203978e-07, + "loss": 0.72618777, + "num_input_tokens_seen": 143294400, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.25390625, + "step": 6645, + "time_per_iteration": 3.043691396713257 + }, + { + "auxiliary_loss_clip": 0.01474268, + "auxiliary_loss_mlp": 0.01259263, + "balance_loss_clip": 1.11681676, + "balance_loss_mlp": 1.02885556, + "epoch": 0.7991342511873986, + "flos": 18987064270560.0, + "grad_norm": 2.09079418660073, + "language_loss": 0.77564442, + "learning_rate": 4.083667263243564e-07, + "loss": 0.80297977, + "num_input_tokens_seen": 143312745, + "router_z_loss_clip": 3.57617188, + "router_z_loss_mlp": 2.30078125, + "step": 6646, + "time_per_iteration": 2.9957690238952637 + }, + { + "auxiliary_loss_clip": 0.01477377, + "auxiliary_loss_mlp": 0.01252256, + "balance_loss_clip": 1.12006354, + "balance_loss_mlp": 1.02547264, + "epoch": 0.7992544940780376, + "flos": 20819088470880.0, + "grad_norm": 1.851695164210104, + "language_loss": 0.71632302, + "learning_rate": 4.0789515037781653e-07, + "loss": 0.74361938, + "num_input_tokens_seen": 143333470, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 2.26953125, + "step": 6647, + "time_per_iteration": 2.953221321105957 + }, + { + "auxiliary_loss_clip": 0.01475736, + "auxiliary_loss_mlp": 0.0125512, + "balance_loss_clip": 1.12031388, + "balance_loss_mlp": 1.02566612, + "epoch": 0.7993747369686768, + "flos": 12642416202240.0, + "grad_norm": 2.1401087473232088, + "language_loss": 0.82526469, + "learning_rate": 4.0742381595231755e-07, + "loss": 0.85257328, + "num_input_tokens_seen": 143350195, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 2.2890625, + "step": 6648, + "time_per_iteration": 2.968315362930298 + }, + { + "auxiliary_loss_clip": 0.01476062, + "auxiliary_loss_mlp": 0.01260447, + "balance_loss_clip": 1.11905241, + "balance_loss_mlp": 1.03118372, + "epoch": 0.7994949798593158, + "flos": 20080544052960.0, + "grad_norm": 2.0767265579319445, + "language_loss": 0.78312051, + "learning_rate": 4.06952723119359e-07, + "loss": 0.81048566, + "num_input_tokens_seen": 143370070, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.2890625, + "step": 6649, + "time_per_iteration": 2.9828567504882812 + }, + { + "auxiliary_loss_clip": 0.01476992, + "auxiliary_loss_mlp": 0.01256081, + "balance_loss_clip": 1.1202873, + "balance_loss_mlp": 1.02891588, + "epoch": 0.7996152227499549, + "flos": 38657259552960.0, + "grad_norm": 1.9394951040168074, + "language_loss": 0.67272252, + "learning_rate": 4.0648187195040504e-07, + "loss": 0.70005327, + "num_input_tokens_seen": 143392275, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 2.27539062, + "step": 6650, + "time_per_iteration": 3.193856954574585 + }, + { + "auxiliary_loss_clip": 0.0142889, + "auxiliary_loss_mlp": 0.0119397, + "balance_loss_clip": 1.08949542, + "balance_loss_mlp": 1.00933838, + "epoch": 0.799735465640594, + "flos": 70250290699680.0, + "grad_norm": 0.814191419664671, + "language_loss": 0.67545617, + "learning_rate": 4.060112625168848e-07, + "loss": 0.70168471, + "num_input_tokens_seen": 143457385, + "router_z_loss_clip": 3.3984375, + "router_z_loss_mlp": 1.84375, + "step": 6651, + "time_per_iteration": 3.530445098876953 + }, + { + "auxiliary_loss_clip": 0.01477628, + "auxiliary_loss_mlp": 0.01257448, + "balance_loss_clip": 1.12161851, + "balance_loss_mlp": 1.02742159, + "epoch": 0.7998557085312331, + "flos": 24243163217280.0, + "grad_norm": 2.9675891376420145, + "language_loss": 0.73912144, + "learning_rate": 4.055408948901886e-07, + "loss": 0.7664721, + "num_input_tokens_seen": 143478785, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 2.29882812, + "step": 6652, + "time_per_iteration": 2.930379867553711 + }, + { + "auxiliary_loss_clip": 0.01479998, + "auxiliary_loss_mlp": 0.0125772, + "balance_loss_clip": 1.12253129, + "balance_loss_mlp": 1.03074527, + "epoch": 0.7999759514218722, + "flos": 27566537605920.0, + "grad_norm": 1.8492878126725747, + "language_loss": 0.71388781, + "learning_rate": 4.050707691416708e-07, + "loss": 0.74126494, + "num_input_tokens_seen": 143500095, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 2.27148438, + "step": 6653, + "time_per_iteration": 3.037081241607666 + }, + { + "auxiliary_loss_clip": 0.01428232, + "auxiliary_loss_mlp": 0.01192696, + "balance_loss_clip": 1.08864582, + "balance_loss_mlp": 1.00806427, + "epoch": 0.8000961943125112, + "flos": 67344433503840.0, + "grad_norm": 0.6738269990922673, + "language_loss": 0.59697628, + "learning_rate": 4.046008853426495e-07, + "loss": 0.62318552, + "num_input_tokens_seen": 143563410, + "router_z_loss_clip": 3.3984375, + "router_z_loss_mlp": 1.84375, + "step": 6654, + "time_per_iteration": 3.463408946990967 + }, + { + "auxiliary_loss_clip": 0.01473953, + "auxiliary_loss_mlp": 0.01255193, + "balance_loss_clip": 1.11686718, + "balance_loss_mlp": 1.02669263, + "epoch": 0.8002164372031504, + "flos": 28736519215680.0, + "grad_norm": 3.84609392301801, + "language_loss": 0.62503558, + "learning_rate": 4.0413124356440464e-07, + "loss": 0.65232706, + "num_input_tokens_seen": 143587455, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 2.28320312, + "step": 6655, + "time_per_iteration": 3.8803112506866455 + }, + { + "auxiliary_loss_clip": 0.01473006, + "auxiliary_loss_mlp": 0.01267304, + "balance_loss_clip": 1.1162014, + "balance_loss_mlp": 1.03765869, + "epoch": 0.8003366800937894, + "flos": 17641359673920.0, + "grad_norm": 1.9217647560879731, + "language_loss": 0.82357264, + "learning_rate": 4.0366184387818223e-07, + "loss": 0.85097575, + "num_input_tokens_seen": 143605915, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 2.29492188, + "step": 6656, + "time_per_iteration": 3.8952364921569824 + } + ], + "logging_steps": 1.0, + "max_steps": 8316, + "num_input_tokens_seen": 143605915, + "num_train_epochs": 1, + "save_steps": 3328, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.5934662084028006e+17, + "train_batch_size": 5, + "trial_name": null, + "trial_params": null +}